[viennacl] 06/09: Reset sources

Wed Feb 19 19:09:56 UTC 2014

This is an automated email from the git hooks/post-receive script.

tsmithe-guest pushed a commit to branch master
in repository viennacl.

commit 48667a27e7ef2391ad62dec6720296cbcc7d1a70
Author: Toby Smithe <git at tsmithe.net>
Date:   Wed Feb 19 16:20:47 2014 +0000

    Reset sources
---
 auxiliary/CMakeLists.txt                           | 345 ------------
 auxiliary/converter.cpp                            | 383 -------------
 auxiliary/ell_matrix/align1/vec_mul.cl             |  38 --
 auxiliary/generate-blas3-prod-align1.cpp           | 285 ----------
 auxiliary/generate-blas3-prod16-align1.cpp         | 282 ----------
 auxiliary/hyb_matrix/align1/vec_mul.cl             |  49 --
 auxiliary/matrix_col/align1/add.cl                 |  36 --
 auxiliary/matrix_col/align1/assign.cl              |  27 -
 auxiliary/matrix_col/align1/clear.cl               |  16 -
 auxiliary/matrix_col/align1/cpu_inplace_mult.cl    |  18 -
 auxiliary/matrix_col/align1/inplace_add.cl         |  26 -
 auxiliary/matrix_col/align1/inplace_divide.cl      |  19 -
 auxiliary/matrix_col/align1/inplace_mult.cl        |  20 -
 auxiliary/matrix_col/align1/inplace_sub.cl         |  27 -
 auxiliary/matrix_col/align1/sub.cl                 |  36 --
 auxiliary/matrix_col/align1/trans_vec_mul.cl       |  28 -
 auxiliary/matrix_col/align1/vec_mul.cl             |  28 -
 auxiliary/matrix_row/align1/add.cl                 |  37 --
 auxiliary/matrix_row/align1/assign.cl              |  27 -
 auxiliary/matrix_row/align1/clear.cl               |  16 -
 auxiliary/matrix_row/align1/cpu_inplace_mult.cl    |  17 -
 auxiliary/matrix_row/align1/inplace_add.cl         |  27 -
 auxiliary/matrix_row/align1/inplace_divide.cl      |  18 -
 auxiliary/matrix_row/align1/inplace_mult.cl        |  20 -
 auxiliary/matrix_row/align1/inplace_sub.cl         |  26 -
 auxiliary/matrix_row/align1/sub.cl                 |  36 --
 auxiliary/matrix_row/align1/trans_vec_mul.cl       |  29 -
 auxiliary/matrix_row/align1/vec_mul.cl             |  30 --
 auxiliary/nmf/align1/el_wise_mul_div.cl            |  14 -
 auxiliary/nmf/align1/el_wise_mul_div.cl~           |  13 -
 auxiliary/nmf/align1/sub_wise.cl                   |  10 -
 auxiliary/nmf/align1/sub_wise.cl~                  |  13 -
 auxiliary/svd/align1/bidiag_pack.cl                |  19 -
 auxiliary/svd/align1/copy_col.cl                   |  17 -
 auxiliary/svd/align1/copy_row.cl                   |  17 -
 auxiliary/svd/align1/givens_prev.cl                |  59 --
 auxiliary/svd/align1/house_col.cl                  |  59 --
 auxiliary/svd/align1/house_row.cl                  |  71 ---
 auxiliary/svd/align1/inverse_signs.cl              |  16 -
 auxiliary/svd/align1/transpose_inplace.cl          |  25 -
 auxiliary/vector/align1/add.cl                     |  19 -
 auxiliary/vector/align1/assign.cl                  |  15 -
 auxiliary/vector/align1/clear.cl                   |  11 -
 auxiliary/vector/align1/cpu_inplace_mul_add.cl     |  16 -
 auxiliary/vector/align1/cpu_inplace_mult.cl        |  12 -
 auxiliary/vector/align1/cpu_mul_add.cl             |  21 -
 auxiliary/vector/align1/cpu_mult.cl                |  17 -
 auxiliary/vector/align1/diag_precond.cl            |  14 -
 auxiliary/vector/align1/divide.cl                  |  18 -
 auxiliary/vector/align1/index_norm_inf.cl          |  58 --
 auxiliary/vector/align1/inner_prod.cl              |  64 ---
 auxiliary/vector/align1/inplace_add.cl             |  15 -
 auxiliary/vector/align1/inplace_div_add.cl         |  17 -
 auxiliary/vector/align1/inplace_div_sub.cl         |  18 -
 auxiliary/vector/align1/inplace_divide.cl          |  13 -
 auxiliary/vector/align1/inplace_mul_add.cl         |  18 -
 auxiliary/vector/align1/inplace_mul_sub.cl         |  18 -
 auxiliary/vector/align1/inplace_mult.cl            |  14 -
 auxiliary/vector/align1/inplace_sub.cl             |  15 -
 auxiliary/vector/align1/mul_add.cl                 |  23 -
 auxiliary/vector/align1/mul_sub.cl                 |  23 -
 auxiliary/vector/align1/mult.cl                    |  17 -
 auxiliary/vector/align1/norm_1.cl                  |  49 --
 auxiliary/vector/align1/norm_2.cl                  |  52 --
 auxiliary/vector/align1/norm_inf.cl                |  43 --
 auxiliary/vector/align1/plane_rotation.cl          |  28 -
 auxiliary/vector/align1/sqrt_sum.cl                |  22 -
 auxiliary/vector/align1/sub.cl                     |  19 -
 auxiliary/vector/align1/sum.cl                     |  21 -
 auxiliary/vector/align1/swap.cl                    |  23 -
 auxiliary/vector/align1/vmax.cl                    |  22 -
 auxiliary/vector/align16/add.cl                    |  21 -
 auxiliary/vector/align16/cpu_inplace_mul.cl        |  13 -
 auxiliary/vector/align16/cpu_mult.cl               |  17 -
 auxiliary/vector/align16/divide.cl                 |  20 -
 auxiliary/vector/align16/inplace_add.cl            |  16 -
 auxiliary/vector/align16/inplace_divide.cl         |  15 -
 auxiliary/vector/align16/inplace_mult.cl           |  14 -
 auxiliary/vector/align16/inplace_sub.cl            |  17 -
 auxiliary/vector/align16/mult.cl                   |  18 -
 auxiliary/vector/align16/sub.cl                    |  21 -
 auxiliary/vector/align4/cpu_inplace_mul_add.cl     |  17 -
 auxiliary/vector/align4/cpu_mul_add.cl             |  21 -
 auxiliary/vector/align4/inplace_div_add.cl         |  20 -
 auxiliary/vector/align4/inplace_div_sub.cl         |  20 -
 auxiliary/vector/align4/inplace_mul_add.cl         |  18 -
 auxiliary/vector/align4/inplace_mul_sub.cl         |  19 -
 auxiliary/vector/align4/mul_add.cl                 |  22 -
 examples/tutorial/iterative-ublas.cpp~             | 163 ------
 tests/src/generator_inner_product.cpp              | 172 ------
 tests/src/generator_matrix.cpp                     | 219 --------
 tests/src/generator_matrix_vector_product.cpp      | 234 --------
 tests/src/generator_vector.cpp                     | 331 ------------
 tests/src/matrix.cpp                               | 533 -------------------
 tests/src/matrix_range.cpp                         | 558 -------------------
 tests/src/matrix_slice.cpp                         | 563 --------------------
 tests/src/vector_range.cpp                         | 396 --------------
 tests/src/vector_slice.cpp                         | 396 --------------
 viennacl/generator/compound_node.hpp               | 199 -------
 viennacl/generator/custom_operation.hpp            | 268 ----------
 viennacl/generator/elementwise_modifier.hpp        |  93 ----
 viennacl/generator/get_kernels_infos.hpp           | 579 --------------------
 viennacl/generator/make_code/expression.hpp        | 163 ------
 viennacl/generator/make_code/inner_product.hpp     | 131 -----
 viennacl/generator/make_code/make_code.hpp         |  33 --
 .../generator/make_code/matrix-vector_product.hpp  | 143 -----
 .../generator/make_code/regular_compound_node.hpp  | 104 ----
 viennacl/generator/meta_tools/typelist.hpp         | 386 --------------
 viennacl/generator/meta_tools/utils.hpp            | 290 ----------
 viennacl/generator/operation_types.hpp             | 130 -----
 .../symbolic_types/convenience_typedef.hpp         | 176 ------
 .../generator/symbolic_types/symbolic_matrix.hpp   | 156 ------
 .../generator/symbolic_types/symbolic_scalars.hpp  | 176 ------
 .../generator/symbolic_types/symbolic_vector.hpp   | 179 -------
 viennacl/generator/tokens_management.hpp           | 107 ----
 .../generator/traits/general_purpose_traits.hpp    | 250 ---------
 viennacl/generator/traits/result_of.hpp            | 591 ---------------------
 viennacl/generator/tree_operations.hpp             | 487 -----------------
 viennacl/linalg/coordinate_matrix_operations.hpp   | 222 --------
 viennacl/linalg/lanczos.hpp~                       | 490 -----------------
 viennacl/tools/matrix_kernel_class_deducer.hpp     |  73 ---
 .../tools/matrix_prod_kernel_class_deducer.hpp     | 171 ------
 122 files changed, 12485 deletions(-)

diff --git a/auxiliary/CMakeLists.txt b/auxiliary/CMakeLists.txt
deleted file mode 100644
index 662eb98..0000000
--- a/auxiliary/CMakeLists.txt
+++ /dev/null
@@ -1,345 +0,0 @@
-include_directories(${Boost_INCLUDE_DIRS})
-
-add_executable(generate-blas3-solve-align1 generate-blas3-solve-align1.cpp)
-add_executable(generate-blas3-prod-align1 generate-blas3-prod-align1.cpp)
-add_executable(generate-blas3-prod16-align1 generate-blas3-prod16-align1.cpp)
-
-function(generate_blas3_prod_align1 outvar)
-   set(crstr_0 col)
-   set(crstr_1 row)
-   set(ATstr_0 A)
-   set(ATstr_1 T)
-   set(outfiles)
-
-   foreach(ar 0 1) # A is column/row major
-   foreach(br 0 1) # B is column/row major
-   foreach(cr 0 1) # C is column/row major
-   foreach(at 0 1) # A is (not) transposed
-   foreach(bt 0 1) # B is (not) transposed
-      set(d "${CMAKE_CURRENT_BINARY_DIR}")
-      set(d "${d}/matrix_prod_${crstr_${ar}}_${crstr_${br}}_${crstr_${cr}}")
-      set(d "${d}/align1")
-      file(MAKE_DIRECTORY "${d}")
-
-      # standard kernels:
-      set(o "${d}/prod_${ATstr_${at}}${ATstr_${bt}}.cl")
-      file(RELATIVE_PATH ro "${CMAKE_CURRENT_BINARY_DIR}" "${o}")
-      add_custom_command(OUTPUT "${o}"
-         COMMAND generate-blas3-prod-align1
-            ${ar} ${br} ${cr} ${at} ${bt} > "${o}"
-         COMMENT "Generating ${ro}"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         VERBATIM)
-      list(APPEND outfiles "${o}")
-
-      # fast kernels:
-      set(o16 "${d}/prod16_${ATstr_${at}}${ATstr_${bt}}.cl")
-      file(RELATIVE_PATH ro16 "${CMAKE_CURRENT_BINARY_DIR}" "${o16}")
-      add_custom_command(OUTPUT "${o16}"
-         COMMAND generate-blas3-prod16-align1
-            ${ar} ${br} ${cr} ${at} ${bt} > "${o16}"
-         COMMENT "Generating ${ro16}"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         VERBATIM)
-      list(APPEND outfiles "${o16}")
-
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   set(${outvar} "${outfiles}" PARENT_SCOPE)
-endfunction()
-
-function(generate_blas3_solve_align1 outvar)
-   set(crstr_0 col)
-   set(crstr_1 row)
-   set(tstr_0)
-   set(tstr_1 trans_)
-   set(ulstr_0 lower)
-   set(ulstr_1 upper)
-   set(unitstr_0)
-   set(unitstr_1 unit_)
-   set(outfiles)
-
-   foreach(ar 0 1) # A is column/row major
-   foreach(br 0 1) # A is column/row major
-   foreach(at 0 1) # A is transposed
-   foreach(bt 0 1) # B is transposed
-   foreach(ul 0 1) # upper/lower
-   foreach(un 0 1) # unit
-      set(d "${CMAKE_CURRENT_BINARY_DIR}")
-      set(d "${d}/matrix_solve_${crstr_${ar}}_${crstr_${br}}")
-      set(d "${d}/align1")
-      file(MAKE_DIRECTORY "${d}")
-      set(o "${d}/${tstr_${at}}${unitstr_${un}}${ulstr_${ul}}_${tstr_${bt}}solve.cl")
-      file(RELATIVE_PATH ro "${CMAKE_CURRENT_BINARY_DIR}" "${o}")
-      add_custom_command(OUTPUT "${o}"
-         COMMAND generate-blas3-solve-align1
-            ${ar} ${br} ${at} ${bt} ${ul} ${un} > "${o}"
-         COMMENT "Generating ${ro}"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         VERBATIM)
-      list(APPEND outfiles "${o}")
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   set(${outvar} "${outfiles}" PARENT_SCOPE)
-endfunction()
-
-# Matrix-Matrix products
-generate_blas3_prod_align1(MATRIX_PROD_SRCS)
-
-# Matrix-Matrix triangular solver
-generate_blas3_solve_align1(MATRIX_SOLVE_SRCS)
-
-set(COMPRESSED_MATRIX_SRCS
-   compressed_matrix/align1/bicgstab_kernel1.cl
-   compressed_matrix/align1/bicgstab_kernel2.cl
-   compressed_matrix/align1/jacobi.cl
-   compressed_matrix/align1/jacobi_precond.cl
-   compressed_matrix/align1/lu_backward.cl
-   compressed_matrix/align1/lu_forward.cl
-   compressed_matrix/align1/row_scaling_1.cl
-   compressed_matrix/align1/row_scaling_2.cl
-   compressed_matrix/align1/vec_mul.cl
-   compressed_matrix/align4/vec_mul.cl
-   compressed_matrix/align8/vec_mul.cl)
-
-set(COORDINATE_MATRIX_SRCS
-   coordinate_matrix/align1/vec_mul.cl
-   coordinate_matrix/align128/dummy)
-
-set(ELL_MATRIX_SRCS
-   ell_matrix/align1/vec_mul.cl)
-
-set(HYB_MATRIX_SRCS
-   hyb_matrix/align1/vec_mul.cl)
-
-set(MATRIX_COL_SRCS
-   matrix_col/align1/add.cl
-   matrix_col/align1/assign.cl
-   matrix_col/align1/clear.cl
-   matrix_col/align1/cpu_inplace_mult.cl
-   matrix_col/align1/fft_direct.cl
-   matrix_col/align1/fft_radix2.cl
-   matrix_col/align1/fft_radix2_local.cl
-   matrix_col/align1/fft_reorder.cl
-   matrix_col/align1/inplace_add.cl
-   matrix_col/align1/inplace_divide.cl
-   matrix_col/align1/inplace_mult.cl
-   matrix_col/align1/inplace_sub.cl
-   matrix_col/align1/lower_triangular_substitute_inplace.cl
-   matrix_col/align1/lu_factorize.cl
-   matrix_col/align1/rank1_update.cl
-   matrix_col/align1/scaled_rank1_update.cl
-   matrix_col/align1/sub.cl
-   matrix_col/align1/trans_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_unit_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_unit_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_vec_mul.cl
-   matrix_col/align1/unit_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/unit_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/upper_triangular_substitute_inplace.cl
-   matrix_col/align1/vec_mul.cl
-   matrix_col/align16/dummy)
-
-set(MATRIX_ROW_SRCS
-   matrix_row/align1/add.cl
-   matrix_row/align1/assign.cl
-   matrix_row/align1/clear.cl
-   matrix_row/align1/cpu_inplace_mult.cl
-   matrix_row/align1/fft_direct.cl
-   matrix_row/align1/fft_radix2.cl
-   matrix_row/align1/fft_radix2_local.cl
-   matrix_row/align1/fft_reorder.cl
-   matrix_row/align1/inplace_add.cl
-   matrix_row/align1/inplace_divide.cl
-   matrix_row/align1/inplace_mult.cl
-   matrix_row/align1/inplace_sub.cl
-   matrix_row/align1/lower_triangular_substitute_inplace.cl
-   matrix_row/align1/lu_factorize.cl
-   matrix_row/align1/rank1_update.cl
-   matrix_row/align1/scaled_rank1_update.cl
-   matrix_row/align1/sub.cl
-   matrix_row/align1/trans_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_unit_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_unit_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_vec_mul.cl
-   matrix_row/align1/unit_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/unit_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/upper_triangular_substitute_inplace.cl
-   matrix_row/align1/vec_mul.cl
-   matrix_row/align16/dummy)
-
-set(SCALAR_SRCS
-   scalar/align1/add.cl
-   scalar/align1/cpu_add.cl
-   scalar/align1/cpu_div.cl
-   scalar/align1/cpu_inplace_add.cl
-   scalar/align1/cpu_inplace_div.cl
-   scalar/align1/cpu_inplace_mul.cl
-   scalar/align1/cpu_inplace_sub.cl
-   scalar/align1/cpu_mul.cl
-   scalar/align1/cpu_sub.cl
-   scalar/align1/divide.cl
-   scalar/align1/inplace_add.cl
-   scalar/align1/inplace_div.cl
-   scalar/align1/inplace_mul.cl
-   scalar/align1/inplace_sub.cl
-   scalar/align1/mul.cl
-   scalar/align1/sub.cl)
-
-set(VECTOR_SRCS
-   vector/align16/add.cl
-   vector/align16/cpu_inplace_mul.cl
-   vector/align16/cpu_mult.cl
-   vector/align16/divide.cl
-   vector/align16/inplace_add.cl
-   vector/align16/inplace_divide.cl
-   vector/align16/inplace_mult.cl
-   vector/align16/inplace_sub.cl
-   vector/align16/mult.cl
-   vector/align16/sub.cl
-   vector/align1/add.cl
-   vector/align1/assign.cl
-   vector/align1/clear.cl
-   vector/align1/cpu_inplace_mul_add.cl
-   vector/align1/cpu_inplace_mult.cl
-   vector/align1/cpu_mul_add.cl
-   vector/align1/cpu_mult.cl
-   vector/align1/diag_precond.cl
-   vector/align1/divide.cl
-   vector/align1/index_norm_inf.cl
-   vector/align1/inner_prod.cl
-   vector/align1/inplace_add.cl
-   vector/align1/inplace_div_add.cl
-   vector/align1/inplace_divide.cl
-   vector/align1/inplace_div_sub.cl
-   vector/align1/inplace_mul_add.cl
-   vector/align1/inplace_mul_sub.cl
-   vector/align1/inplace_mult.cl
-   vector/align1/inplace_sub.cl
-   vector/align1/mul_add.cl
-   vector/align1/mul_sub.cl
-   vector/align1/mult.cl
-   vector/align1/norm_1.cl
-   vector/align1/norm_2.cl
-   vector/align1/norm_inf.cl
-   vector/align1/plane_rotation.cl
-   vector/align1/sqrt_sum.cl
-   vector/align1/sub.cl
-   vector/align1/sum.cl
-   vector/align1/swap.cl
-   vector/align1/vmax.cl
-   vector/align4/cpu_inplace_mul_add.cl
-   vector/align4/cpu_mul_add.cl
-   vector/align4/inplace_div_add.cl
-   vector/align4/inplace_div_sub.cl
-   vector/align4/inplace_mul_add.cl
-   vector/align4/inplace_mul_sub.cl
-   vector/align4/mul_add.cl)
-
-set(FFT_SRCS
-   fft/align1/bluestein_post.cl
-   fft/align1/bluestein_pre.cl
-   fft/align1/complex_to_real.cl
-   fft/align1/fft_div_vec_scalar.cl
-   fft/align1/fft_mult_vec.cl
-   fft/align1/real_to_complex.cl
-   fft/align1/reverse_inplace.cl
-   fft/align1/transpose.cl
-   fft/align1/transpose_inplace.cl
-   fft/align1/vandermonde_prod.cl
-   fft/align1/zero2.cl
-   )
-
-set(SVD_SRCS
-   svd/align1/copy_col.cl
-   svd/align1/copy_row.cl
-   svd/align1/transpose_inplace.cl
-   svd/align1/inverse_signs.cl
-   svd/align1/givens_prev.cl
-   svd/align1/bidiag_pack.cl
-   svd/align1/house_col.cl
-   svd/align1/house_row.cl
-   )
-
-set(SPAI_SRCS
-   spai/align1/assemble_blocks.cl
-   spai/align1/block_bv_assembly.cl
-   spai/align1/block_least_squares.cl
-   spai/align1/block_q_mult.cl
-   spai/align1/block_qr.cl
-   spai/align1/block_qr_assembly.cl
-   spai/align1/block_qr_assembly_1.cl
-   spai/align1/block_r_assembly.cl
-   )
-
-set(NMF_SRCS
-   nmf/align1/el_wise_mul_div.cl
-   nmf/align1/sub_wise.cl
-   )
-
-set(CL_SRCS)
-foreach(f IN LISTS COMPRESSED_MATRIX_SRCS COORDINATE_MATRIX_SRCS ELL_MATRIX_SRCS HYB_MATRIX_SRCS
-      MATRIX_COL_SRCS MATRIX_ROW_SRCS SCALAR_SRCS VECTOR_SRCS FFT_SRCS SVD_SRCS SPAI_SRCS NMF_SRCS)
-   get_filename_component(d "${CMAKE_CURRENT_BINARY_DIR}/${f}" PATH)
-   file(MAKE_DIRECTORY "${d}")
-   configure_file(${f} "${CMAKE_CURRENT_BINARY_DIR}/${f}" COPYONLY)
-   list(APPEND CL_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${f}")
-endforeach()
-list(APPEND CL_SRCS ${MATRIX_PROD_SRCS} ${MATRIX_SOLVE_SRCS})
-
-add_executable(converter converter.cpp)
-target_link_libraries(converter ${Boost_LIBRARIES})
-
-set(KERNEL_HDRS)
-set(KERNEL_SRCS)
-foreach(d
-      compressed_matrix
-      coordinate_matrix
-      ell_matrix
-      hyb_matrix
-      matrix_col
-      matrix_prod_col_col_col
-      matrix_prod_col_col_row
-      matrix_prod_col_row_col
-      matrix_prod_col_row_row
-      matrix_prod_row_col_col
-      matrix_prod_row_col_row
-      matrix_prod_row_row_col
-      matrix_prod_row_row_row
-      matrix_row
-      matrix_solve_col_col
-      matrix_solve_col_row
-      matrix_solve_row_col
-      matrix_solve_row_row
-      scalar
-      vector
-      fft
-      svd
-      spai
-      nmf
-      )
-   set(f "${PROJECT_SOURCE_DIR}/viennacl/linalg/kernels/${d}")
-   list(APPEND KERNEL_HDRS "${f}_kernels.h")
-   list(APPEND KERNEL_SRCS "${f}_source.h")
-endforeach()
-
-file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/viennacl/linalg/kernels")
-
-add_custom_command(OUTPUT ${KERNEL_HDRS} ${KERNEL_SRCS}
-   COMMAND converter
-   DEPENDS ${CL_SRCS}
-   WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-   COMMENT "Generating kernel headers and sources"
-   VERBATIM)
-
-add_custom_target(kernels ALL
-   DEPENDS ${KERNEL_HDRS} ${KERNEL_SRCS})
diff --git a/auxiliary/converter.cpp b/auxiliary/converter.cpp
deleted file mode 100644
index 03f624c..0000000
--- a/auxiliary/converter.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
-* Converts OpenCL sources to header file string constants
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#define BOOST_FILESYSTEM_VERSION 2
-
-#include <boost/filesystem/operations.hpp>
-#include <boost/filesystem/path.hpp>
-#include <iostream>
-
-namespace fs = boost::filesystem;
-
-void writeSourceFile(std::ofstream & out_file, std::string & filename, const char * dirname, const char * alignment)
-{
-    std::string fullpath(dirname);
-    fullpath += "/";
-    fullpath += alignment;
-    fullpath += "/";
-    fullpath += filename;
-    std::ifstream in_file(fullpath.c_str());
-    std::string tmp;
-
-    if (in_file.is_open())
-    {
-        //write variable declaration:
-        out_file << "const char * const " << dirname << "_" << alignment << "_" << filename.substr(0, filename.size()-3) << " = " << std::endl;
-    
-        //write source string:
-        while (getline(in_file, tmp, '\n'))
-        {
-            if (tmp.size() > 0)
-            {
-      	        //out_file << "\"" << tmp.replace(tmp.end()-1, tmp.end(), "\\n\"") << std::endl;
-                if ( *(tmp.end()-1) == '\r')  //Windows line delimiter, \r\n
-                    out_file << "\"" << tmp.replace(tmp.end()-1, tmp.end(), "\\n\"") << std::endl;
-                else //Unix line delimiter \n
-                    out_file << "\"" << tmp.append("\\n\"") << std::endl;
-            }
-        }
-        out_file << "; //" << dirname << "_" << alignment << "_" << filename.substr(0, filename.size()-3)  << std::endl << std::endl;
-        
-    }
-    else
-        std::cerr << "Failed to open file " << filename << std::endl;
-}
-
-void createSourceFile(const char * dirname)
-{
-    //Step 1: Open source file
-    std::string header_name(dirname);
-    std::ofstream source_file(("../../viennacl/linalg/kernels/" + header_name + "_source.h").c_str());
-
-    //Step 2: Write source header file preamble
-    std::string dirname_uppercase(dirname);
-    std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), toupper);
-    source_file << "#ifndef VIENNACL_LINALG_KERNELS_" << dirname_uppercase << "_SOURCE_HPP_" << std::endl;
-    source_file << "#define VIENNACL_LINALG_KERNELS_" << dirname_uppercase << "_SOURCE_HPP_" << std::endl;
-    source_file << "//Automatically generated file from auxiliary-directory, do not edit manually!" << std::endl;
-    source_file << "namespace viennacl" << std::endl;
-    source_file << "{" << std::endl;
-    source_file << " namespace linalg" << std::endl;
-    source_file << " {" << std::endl;
-    source_file << "  namespace kernels" << std::endl;
-    source_file << "  {" << std::endl;
-
-    //Step 3: Write all OpenCL kernel sources into header file
-    fs::path filepath = fs::system_complete( fs::path( dirname ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\n In directory " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::cout << "\nGenerating kernels from directory " << alignment_itr->path().directory_string() << std::endl;
-
-                //write and register single precision sources:
-                for ( fs::directory_iterator cl_itr( alignment_itr->path() );
-                      cl_itr != end_iter;
-                      ++cl_itr )
-                {
-                    std::string fname = cl_itr->path().filename();
-                    std::string alignment = alignment_itr->path().filename();
-
-                    size_t pos = fname.find(".cl");
-                    if ( pos == std::string::npos )
-                      continue;
-
-                    if (fname.substr(fname.size()-3, 3) == ".cl")
-                        writeSourceFile(source_file, fname, dirname, alignment.c_str());
-                        //std::cout << alignment_itr->path().filename() << "/" << fname << std::endl;
-                } //for                
-            } //if is_directory
-        } //for alignment_iterator
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Final Step: Write file tail:
-    source_file << "  }  //namespace kernels" << std::endl;
-    source_file << " }  //namespace linalg" << std::endl;
-    source_file << "}  //namespace viennacl" << std::endl;
-    source_file << "#endif" << std::endl;
-    source_file.close();
-}
-
-
-unsigned int getBestKernel(const char * dirname, std::string & kernel_name, unsigned int alignment)
-{
-    unsigned int search_alignment = alignment;
-    //std::cout << "Searching for best match for " << kernel_name << " with alignment " << alignment << std::endl;
-
-    while (search_alignment > 1)
-    {
-        std::ostringstream oss;
-        oss << dirname << "/align" << search_alignment;
-        //std::cout << "Searching " << oss.str() << std::endl;
-
-        //try to find kernel in directory:
-        fs::path filepath = fs::system_complete( fs::path( oss.str() ) );
-        if ( fs::is_directory( filepath ) ) //directory exists?
-        {
-            fs::directory_iterator end_iter;
-            for ( fs::directory_iterator cl_itr( filepath );
-                  cl_itr != end_iter;
-                  ++cl_itr )
-            {
-                std::string fname = cl_itr->path().filename();
-                if (fname == kernel_name)
-                {
-                  //std::cout << "Found matching kernel for " << kernel_name << " with alignment " << alignment << " at alignment " << search_alignment << std::endl;
-                    return search_alignment;
-                }
-            }
-        }
-
-        search_alignment /= 2;
-    }
-
-    //std::cout << "Found alignment 1 only..." << std::endl;
-    //nothing found: return alignment 1:
-    return 1;
-}
-
-
-void writeKernelInit(std::ostream & kernel_file, const char * dirname, std::string & subfolder, bool is_float)
-{
-    //extract alignment information from subfolder string:
-    std::istringstream stream(subfolder.substr(5, subfolder.size()-5));
-    unsigned int alignment = 0;
-    stream >> alignment;
-    if (alignment == 0)
-        std::cerr << "ERROR: Could not extract alignment from " << subfolder << std::endl;
-
-    kernel_file << "   template <>" << std::endl;
-    kernel_file << "   struct " << dirname;
-    if (is_float)
-        kernel_file << "<float, ";
-    else
-        kernel_file << "<double, ";
-    kernel_file << alignment << ">" << std::endl;
-    kernel_file << "   {" << std::endl;
-
-    kernel_file << "    static std::string program_name()" << std::endl;
-    kernel_file << "    {" << std::endl;
-    kernel_file << "      return \"";
-    if (is_float)
-        kernel_file << "f";
-    else
-        kernel_file << "d";
-    kernel_file << "_" << dirname << "_" << alignment << "\";" << std::endl;
-    kernel_file << "    }" << std::endl;
-    
-    kernel_file << "    static void init()" << std::endl;
-    kernel_file << "    {" << std::endl;
-    if (is_float)
-      kernel_file << "      viennacl::ocl::DOUBLE_PRECISION_CHECKER<float>::apply();" << std::endl;
-    else
-      kernel_file << "      viennacl::ocl::DOUBLE_PRECISION_CHECKER<double>::apply();" << std::endl;
-    kernel_file << "      static std::map<cl_context, bool> init_done;" << std::endl;
-    kernel_file << "      viennacl::ocl::context & context_ = viennacl::ocl::current_context();" << std::endl;
-    kernel_file << "      if (!init_done[context_.handle().get()])" << std::endl;
-    kernel_file << "      {" << std::endl;
-    kernel_file << "        std::string source;" << std::endl;
-    if (!is_float)
-      kernel_file << "        std::string fp64_ext = viennacl::ocl::current_device().double_support_extension();" << std::endl;
-
-    //iterate over all kernels in align1-folder:
-    std::string current_dir(dirname);
-    current_dir += "/align1";
-    fs::path filepath = fs::system_complete( fs::path( current_dir ) );
-
-    fs::directory_iterator end_iter;
-    //write and register single precision sources:
-    for ( fs::directory_iterator cl_itr( filepath );
-          cl_itr != end_iter;
-          ++cl_itr )
-    {
-        std::string fname = cl_itr->path().filename();
-        size_t pos = fname.find(".cl");
-        if ( pos == std::string::npos )
-          continue;
-
-        if (fname.substr(fname.size()-3, 3) == ".cl")
-        {
-            //add kernel source to program string:
-            kernel_file << "        source.append(";
-            if (!is_float)
-                kernel_file << "viennacl::tools::make_double_kernel(";
-            kernel_file << dirname << "_align" << getBestKernel(dirname, fname, alignment) << "_" << fname.substr(0, fname.size()-3);
-            if (!is_float)
-                kernel_file << ", fp64_ext)";
-            kernel_file << ");" << std::endl;
-        }
-    } //for                
-    
-    kernel_file << "        std::string prog_name = program_name();" << std::endl;
-    kernel_file << "        #ifdef VIENNACL_BUILD_INFO" << std::endl;
-    kernel_file << "        std::cout << \"Creating program \" << prog_name << std::endl;" << std::endl;
-    kernel_file << "        #endif" << std::endl;
-    kernel_file << "        context_.add_program(source, prog_name);" << std::endl;
-    kernel_file << "        viennacl::ocl::program & prog_ = context_.get_program(prog_name);" << std::endl;
-    
-    //write and register single precision sources:
-    for ( fs::directory_iterator cl_itr( filepath );
-          cl_itr != end_iter;
-          ++cl_itr )
-    {
-        std::string fname = cl_itr->path().filename();
-        size_t pos = fname.find(".cl");
-        if ( pos == std::string::npos )
-          continue;
-
-        if (fname.substr(fname.size()-3, 3) == ".cl")
-        {
-            //initialize kernel:
-            kernel_file << "        prog_.add_kernel(\"" << fname.substr(0, fname.size()-3) << "\");" << std::endl;
-        }
-    } //for                
-    
-    kernel_file << "        init_done[context_.handle().get()] = true;" << std::endl;
-    kernel_file << "       } //if" << std::endl;
-    kernel_file << "     } //init" << std::endl;
-    kernel_file << "    }; // struct" << std::endl << std::endl;
-}
-
-
-
-
-void createKernelFile(const char * dirname)
-{
-    //Step 1: Open kernel file
-    std::string header_name(dirname);
-    std::ofstream kernel_file(("../../viennacl/linalg/kernels/" + header_name + "_kernels.h").c_str());
-
-    //Step 2: Write kernel header file preamble
-    std::string dirname_uppercase(dirname);
-    std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), toupper);
-    kernel_file << "#ifndef _VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
-    kernel_file << "#define _VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
-    kernel_file << "#include \"viennacl/tools/tools.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/kernel.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/platform.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/utils.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/linalg/kernels/" << dirname << "_source.h\"" << std::endl;
-    kernel_file << std::endl;
-    kernel_file << "//Automatically generated file from aux-directory, do not edit manually!" << std::endl;
-    kernel_file << "namespace viennacl" << std::endl;
-    kernel_file << "{" << std::endl;
-    kernel_file << " namespace linalg" << std::endl;
-    kernel_file << " {" << std::endl;
-    kernel_file << "  namespace kernels" << std::endl;
-    kernel_file << "  {" << std::endl;
-
-    //Step 3: Write class information:
-    kernel_file << "   template<class TYPE, unsigned int alignment>" << std::endl;
-    kernel_file << "   struct " << dirname << ";" << std::endl << std::endl;
-    
-    //Step 4: Write single precision kernels
-    std::string dir(dirname);
-    kernel_file << std::endl << "    /////////////// single precision kernels //////////////// " << std::endl;
-    fs::path filepath = fs::system_complete( fs::path( dir ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::string subfolder = alignment_itr->path().filename();
-                if( subfolder.find("align") == std::string::npos )
-                  continue;
-                writeKernelInit(kernel_file, dirname, subfolder, true);
-            } //if is_directory
-        } //for alignment_iterator
-        kernel_file << std::endl;
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Step 5: Write double precision kernels
-    kernel_file << std::endl << "    /////////////// double precision kernels //////////////// " << std::endl;
-    filepath = fs::system_complete( fs::path( dir ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::string subfolder = alignment_itr->path().filename();
-                if( subfolder.find("align") == std::string::npos )
-                  continue;
-                writeKernelInit(kernel_file, dirname, subfolder, false);
-            } //if is_directory
-        } //for alignment_iterator
-        kernel_file << std::endl;
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Final Step: Write file tail:
-    kernel_file << "  }  //namespace kernels" << std::endl;
-    kernel_file << " }  //namespace linalg" << std::endl;
-    kernel_file << "}  //namespace viennacl" << std::endl;
-    kernel_file << "#endif" << std::endl;
-    kernel_file.close();
-}
-
-void createHeaders(const char * dirname)
-{
-    createKernelFile(dirname);
-    createSourceFile(dirname);
-}
-
-int main(int args, char * argsv[])
-{
-    createHeaders("compressed_matrix");
-    createHeaders("coordinate_matrix");
-    createHeaders("ell_matrix");
-    createHeaders("hyb_matrix");
-    createHeaders("matrix_row");
-    createHeaders("matrix_col");
-    createHeaders("matrix_prod_row_row_row");
-    createHeaders("matrix_prod_row_row_col");
-    createHeaders("matrix_prod_row_col_row");
-    createHeaders("matrix_prod_row_col_col");
-    createHeaders("matrix_prod_col_row_row");
-    createHeaders("matrix_prod_col_row_col");
-    createHeaders("matrix_prod_col_col_row");
-    createHeaders("matrix_prod_col_col_col");
-    createHeaders("matrix_solve_col_col");
-    createHeaders("matrix_solve_col_row");
-    createHeaders("matrix_solve_row_col");
-    createHeaders("matrix_solve_row_row");
-    createHeaders("scalar");
-    createHeaders("vector");
-    createHeaders("fft");
-    createHeaders("svd");
-    createHeaders("spai");
-    createHeaders("nmf");
-}
-
diff --git a/auxiliary/ell_matrix/align1/vec_mul.cl b/auxiliary/ell_matrix/align1/vec_mul.cl
deleted file mode 100644
index ba427bf..0000000
--- a/auxiliary/ell_matrix/align1/vec_mul.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
-__kernel void vec_mul(
-    const __global int* coords,
-    const __global float* elements,
-    const __global const float * vector,
-    __global float * result,
-    const unsigned int row_num,
-    const unsigned int col_num,
-    const unsigned int internal_row_num,
-    const unsigned int items_per_row,
-    const unsigned int aligned_items_per_row
-    )
-{
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    for(uint row_id = glb_id; row_id < row_num; row_id += glb_sz)
-    {
-        float sum = 0;
-        
-        uint offset = row_id;
-        for(uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
-        {
-            float val = elements[offset];
-
-
-            if(val != 0.0f)
-            {
-                int col = coords[offset];    
-                sum += (vector[col] * val);
-            }
-            
-        }
-
-        result[row_id] = sum;
-    }
-}
\ No newline at end of file
diff --git a/auxiliary/generate-blas3-prod-align1.cpp b/auxiliary/generate-blas3-prod-align1.cpp
deleted file mode 100755
index c15d89f..0000000
--- a/auxiliary/generate-blas3-prod-align1.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
-* Generates BLAS level 3 routines
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#include <iostream>
-#include <stdlib.h>
-
-//generate code for C = op1(A) * op2(B), where A, B, C can have different storage layouts and opX(D) = D or trans(D)
-void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major_C,
-                              bool transpose_A, bool transpose_B)
-{
-  //write header:
-  std::cout << "// file automatically generated - do not edit!" << std::endl;
-  std::cout << "// matrix-matrix multiplication C = ";
-  if (transpose_A)
-    std::cout << "A^T * ";
-  else
-    std::cout << "A * ";
-  if (transpose_B)
-    std::cout << "B^T" << std::endl;
-  else
-    std::cout << "B" << std::endl;
-  std::cout << "// matrix layouts: ";
-  if (row_major_C)
-    std::cout << "C...row_major, ";
-  else
-    std::cout << "C...col_major, ";
-  if (row_major_A)
-    std::cout << "A...row_major, ";
-  else
-    std::cout << "A...col_major, ";
-  if (row_major_B)
-    std::cout << "B...row_major" << std::endl;
-  else
-    std::cout << "B...col_major" << std::endl;
-  
-  //start OpenCL code:
-  std::cout << "__kernel void prod_";
-  if (transpose_A)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  if (transpose_B)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  
-  std::cout << "(" << std::endl;
-  std::cout << "          float alpha," << std::endl;
-  std::cout << "          __global const float * A," << std::endl;
-  std::cout << "          unsigned int A_row_start," << std::endl;
-  std::cout << "          unsigned int A_col_start," << std::endl;
-  std::cout << "          unsigned int A_row_inc," << std::endl;
-  std::cout << "          unsigned int A_col_inc," << std::endl;
-  std::cout << "          unsigned int A_row_size," << std::endl;   //number of elements starting from row_start!
-  std::cout << "          unsigned int A_col_size," << std::endl;
-  std::cout << "          unsigned int A_internal_rows," << std::endl;
-  std::cout << "          unsigned int A_internal_cols," << std::endl;
-  std::cout << "          __global const float * B,  " << std::endl;
-  std::cout << "          unsigned int B_row_start," << std::endl;
-  std::cout << "          unsigned int B_col_start," << std::endl;
-  std::cout << "          unsigned int B_row_inc," << std::endl;
-  std::cout << "          unsigned int B_col_inc," << std::endl;
-  std::cout << "          unsigned int B_row_size," << std::endl;
-  std::cout << "          unsigned int B_col_size," << std::endl;
-  std::cout << "          unsigned int B_internal_rows," << std::endl;
-  std::cout << "          unsigned int B_internal_cols," << std::endl;
-  std::cout << "          float beta," << std::endl;
-  std::cout << "          __global float * C," << std::endl;
-  std::cout << "          unsigned int C_row_start," << std::endl;
-  std::cout << "          unsigned int C_col_start," << std::endl;
-  std::cout << "          unsigned int C_row_inc," << std::endl;
-  std::cout << "          unsigned int C_col_inc," << std::endl;
-  std::cout << "          unsigned int C_row_size," << std::endl;
-  std::cout << "          unsigned int C_col_size," << std::endl;
-  std::cout << "          unsigned int C_internal_rows," << std::endl;
-  std::cout << "          unsigned int C_internal_cols) " << std::endl;
-  std::cout << "{ " << std::endl;
-  std::cout << std::endl;
-  std::cout << "  __local float bufA[" << 16 * 17 << "];" << std::endl;
-  std::cout << "  __local float bufB[" << 16 * 17 << "];" << std::endl;
-  std::cout << std::endl;
-  //do not forgot to change block_size !!!
-  std::cout << "  size_t block_size = 16;//get_local_size(0);" << std::endl;
-  std::cout << "  size_t row_block_id = get_group_id(0);" << std::endl;
-  std::cout << "  size_t col_block_id = get_group_id(1);" << std::endl;
-  std::cout << "  size_t row_thread_id = get_local_id(0);" << std::endl;
-  std::cout << "  size_t col_thread_id = get_local_id(1);" << std::endl;
-  
-  //traverse block row of A (taking mem layout and transpose operation into account)
-  if (row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_row_inc * A_internal_cols;" << std::endl;
-  }
-  else if (row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_col_inc;" << std::endl;
-  }
-  else if (!row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_row_inc;" << std::endl;
-  }
-  else if (!row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_col_inc * A_internal_rows;" << std::endl;
-  }
-
-
-  if (row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_col_inc;" << std::endl;
-  }
-  else if (row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_internal_cols * B_row_inc;" << std::endl;
-  }
-  else if (!row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_internal_rows * B_col_inc;" << std::endl;
-  }
-  else if (!row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_row_inc;" << std::endl;
-  }
-
-
-  if (transpose_A)
-    std::cout << "  size_t block_num = (A_row_size + block_size - 1) / block_size;" << std::endl;
-  else
-    std::cout << "  size_t block_num = (A_col_size + block_size - 1) / block_size;" << std::endl;
-    
-  std::cout << "  float Csub = 0;" << std::endl;
-  
-  //offset of the the memory access by the thread relative to the beginning of the block:
-  if (row_major_A)
-    std::cout << "  size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_cols;" << std::endl;
-  else
-    std::cout << "  size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;" << std::endl;
-
-  if (row_major_B)
-    std::cout << "  size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_cols;" << std::endl;
-  else
-    std::cout << "  size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;" << std::endl;
-
-  std::cout << std::endl;  
-  
-  std::cout << "  size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);" << std::endl;
-  std::cout << "  size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);" << std::endl;
-
-  std::cout << "  for (size_t block = 0;" << std::endl;
-  std::cout << "           block < block_num;" << std::endl;
-  std::cout << "           ++block)" << std::endl;
-  std::cout << "  {" << std::endl;
-  
-  //read block from A and check for access within matrix:
-/*  if (transpose_A)
-    std::cout << "    if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)" << std::endl;
-  else 
-    std::cout << "    if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)" << std::endl;
-  
-  std::cout << "      bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];" << std::endl;
-  std::cout << "    else" << std::endl;
-  std::cout << "      bufA[row_thread_id * block_size + col_thread_id] = 0;" << std::endl;*/
-
-  if (transpose_A && row_major_A)
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (transpose_A && !row_major_A)
-    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (!transpose_A && row_major_A)
-    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (!transpose_A && !row_major_A)
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-
-
-  if (transpose_B && row_major_B)
-    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (transpose_B && !row_major_B)
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (!transpose_B && row_major_B)
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (!transpose_B && !row_major_B)
-    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-
-  //computation of block-matrix-matrix product is the same for all cases:
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-  //std::cout << "    for (size_t k = 0; k < block_size; ++k)" << std::endl;
-  //std::cout << "      Csub += bufA[row_thread_id_times_block_size + k] * bufB[k * block_size + col_thread_id];" << std::endl;
-  //loop unrolling:
-  std::cout << "    __local float * bufAptr = bufA + row_thread_id_times_block_size;" << std::endl;
-  std::cout << "    __local float * bufBptr = bufB + col_thread_id_times_block_size;" << std::endl;
-  //std::cout << "      Csub += bufA[row_thread_id_times_block_size] * bufB[col_thread_id * block_size];" << std::endl;
-  // code in following line depends on block size and must be changed in case of block_size changes
-  std::cout << "      for(int i = 0; i < 4; i++) {" << std::endl;
-  for (size_t unroll = 0; unroll < 4; ++unroll) {
-    std::cout << "      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;" << std::endl;
-  }
-  std::cout << "     }" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "  + col_thread_id * block_size];" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << " * block_size + col_thread_id];" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "];" << std::endl;
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-  std::cout << "    aBegin += aStep;" << std::endl;
-  std::cout << "    bBegin += bStep;" << std::endl;
-  std::cout << "  }" << std::endl;
-  
-  
-  if (transpose_A)
-    std::cout << "  if (get_global_id(0) < A_col_size && ";
-  else
-    std::cout << "  if (get_global_id(0) < A_row_size && ";
-  
-  if (transpose_B)
-    std::cout << "get_global_id(1) < B_row_size)" << std::endl;
-  else
-    std::cout << "get_global_id(1) < B_col_size)" << std::endl;
-  
-  if (row_major_C)
-    std::cout << "    C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = alpha * Csub + beta * C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start];" << std::endl;
-  else
-    std::cout << "    C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = alpha * Csub + beta * C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows];" << std::endl;
-  std::cout << "}" << std::endl;
-  
-}
-
-void printUsage()
-{
-  std::cout << "Must have five parameters for C = A * B:" << std::endl;
-  std::cout << " 0/1 : storage layout for A (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for B (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for C (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : transpose for A (no/yes)" << std::endl;
-  std::cout << " 0/1 : transpose for B (no/yes)" << std::endl;
-}
-
-void readParameter(bool & param, char input)
-{
-  if (input == '0')
-    param = false;
-  else if (input == '1')
-    param = true;
-  else
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-}
-
-int main(int args, char * argsv[])
-{
-  if (args != 6)
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-  
-  //the following flags are 'true' for row_major layout
-  bool layout_A;
-  bool layout_B;
-  bool layout_C;
-
-  readParameter(layout_A, argsv[1][0]);
-  readParameter(layout_B, argsv[2][0]);
-  readParameter(layout_C, argsv[3][0]);
-  
-  bool transpose_A;
-  bool transpose_B;
-  readParameter(transpose_A, argsv[4][0]);
-  readParameter(transpose_B, argsv[5][0]);
-  
-  
-  printMatrixMatrixProduct(layout_A, layout_B, layout_C, transpose_A, transpose_B);
-}
diff --git a/auxiliary/generate-blas3-prod16-align1.cpp b/auxiliary/generate-blas3-prod16-align1.cpp
deleted file mode 100644
index 78d21b3..0000000
--- a/auxiliary/generate-blas3-prod16-align1.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
-* Generates BLAS level 3 routines
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#include <iostream>
-#include <stdlib.h>
-
-//generate code for C = alpha * op1(A) * op2(B) + beta * C, where A, B, C can have different storage layouts and opX(D) = D or trans(D)
-void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major_C,
-                              bool transpose_A, bool transpose_B)
-{
-  std::size_t vector_size =  4;
-  std::size_t block_size  = 16;
-
-  //write header:
-  std::cout << "// file automatically generated - do not edit!" << std::endl;
-  std::cout << "// matrix-matrix multiplication C = ";
-  if (transpose_A)
-    std::cout << "A^T * ";
-  else
-    std::cout << "A * ";
-  if (transpose_B)
-    std::cout << "B^T" << std::endl;
-  else
-    std::cout << "B" << std::endl;
-  std::cout << "// matrix layouts: ";
-  if (row_major_C)
-    std::cout << "C...row_major, ";
-  else
-    std::cout << "C...col_major, ";
-  if (row_major_A)
-    std::cout << "A...row_major, ";
-  else
-    std::cout << "A...col_major, ";
-  if (row_major_B)
-    std::cout << "B...row_major" << std::endl;
-  else
-    std::cout << "B...col_major" << std::endl;
-  
-  //start OpenCL code:
-  std::cout << "__kernel void prod16_";
-  if (transpose_A)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  if (transpose_B)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  
-  std::cout << "(" << std::endl;
-  std::cout << "          float alpha," << std::endl;
-  std::cout << "          __global const float * A," << std::endl;
-  std::cout << "          unsigned int A_row_start," << std::endl;
-  std::cout << "          unsigned int A_col_start," << std::endl;
-  std::cout << "          unsigned int A_row_inc," << std::endl;
-  std::cout << "          unsigned int A_col_inc," << std::endl;
-  std::cout << "          unsigned int A_row_size," << std::endl;   //number of elements starting from row_start, using an increment of A_row_inc
-  std::cout << "          unsigned int A_col_size," << std::endl;
-  std::cout << "          unsigned int A_internal_rows," << std::endl;
-  std::cout << "          unsigned int A_internal_cols," << std::endl;
-  std::cout << "          __global const float * B,  " << std::endl;
-  std::cout << "          unsigned int B_row_start," << std::endl;
-  std::cout << "          unsigned int B_col_start," << std::endl;
-  std::cout << "          unsigned int B_row_inc," << std::endl;
-  std::cout << "          unsigned int B_col_inc," << std::endl;
-  std::cout << "          unsigned int B_row_size," << std::endl;
-  std::cout << "          unsigned int B_col_size," << std::endl;
-  std::cout << "          unsigned int B_internal_rows," << std::endl;
-  std::cout << "          unsigned int B_internal_cols," << std::endl;
-  std::cout << "          float beta," << std::endl;
-  std::cout << "          __global float * C," << std::endl;
-  std::cout << "          unsigned int C_row_start," << std::endl;
-  std::cout << "          unsigned int C_col_start," << std::endl;
-  std::cout << "          unsigned int C_row_inc," << std::endl;
-  std::cout << "          unsigned int C_col_inc," << std::endl;
-  std::cout << "          unsigned int C_row_size," << std::endl;
-  std::cout << "          unsigned int C_col_size," << std::endl;
-  std::cout << "          unsigned int C_internal_rows," << std::endl;
-  std::cout << "          unsigned int C_internal_cols) " << std::endl;
-  std::cout << "{ " << std::endl;
-  //do not forgot to change block_size !!!
-  std::cout << "  size_t row_block_id = get_group_id(1);" << std::endl;    //refers to the row index in op(A), op(B)
-  std::cout << "  size_t col_block_id = get_group_id(0);" << std::endl;    //refers to the col index in op(A), op(B)
-  std::cout << "  size_t row_thread_id = get_local_id(1);" << std::endl;
-  std::cout << "  size_t col_thread_id = get_local_id(0);" << std::endl;
-  std::cout << std::endl;
-  std::cout << "  __local float As[" << block_size * block_size << "];" << std::endl;
-  std::cout << std::endl;
-  std::cout << "  float cv[" << block_size << "] = {";
-  for (std::size_t i=0; i<block_size-1; ++i)
-    std::cout << "0,";
-  std::cout << "0};" << std::endl;
-
-  //traverse block row of A (taking mem layout and transpose operation into account)
-  if (row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * " << block_size << " * A_col_inc + A_col_start) + A_row_start * A_internal_cols;" << std::endl;
-    std::cout << "  size_t aStep = " << block_size << " * A_internal_cols * A_row_inc;" << std::endl;
-    std::cout << "  size_t aEnd = aBegin + A_internal_cols * A_row_inc * A_row_size;" << std::endl;
-  }
-  else if (row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * " << block_size << " * A_row_inc + A_row_start) * A_internal_cols + A_col_start;" << std::endl;
-    std::cout << "  size_t aStep = " << block_size << " * A_col_inc;" << std::endl;
-    std::cout << "  size_t aEnd = aBegin + A_col_inc * A_col_size;" << std::endl;
-  }
-  else if (!row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * " << block_size << " * A_col_inc + A_col_start) * A_internal_rows + A_row_start;" << std::endl;
-    std::cout << "  size_t aStep = " << block_size << " * A_row_inc;" << std::endl;
-    std::cout << "  size_t aEnd = aBegin + A_row_inc * A_row_size;" << std::endl;
-  }
-  else if (!row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * " << block_size << " * A_row_inc + A_row_start) + A_col_start * A_internal_rows;" << std::endl;
-    std::cout << "  size_t aStep = " << block_size << " * A_internal_rows * A_col_inc;" << std::endl;
-    std::cout << "  size_t aEnd = aBegin + A_internal_rows * A_col_inc * A_col_size;" << std::endl;
-  }
-
-
-  if (row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * " << block_size * vector_size << " * B_row_inc + B_row_start) * B_internal_cols + B_col_start;" << std::endl;
-    std::cout << "  size_t bStep = " << block_size << " * B_col_inc;" << std::endl;
-  }
-  else if (row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * " << block_size * vector_size << " * B_col_inc + B_col_start) + B_row_start * B_internal_cols;" << std::endl;
-    std::cout << "  size_t bStep = " << block_size << " * B_row_inc * B_internal_cols;" << std::endl;
-  }
-  else if (!row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * " << block_size * vector_size << " * B_row_inc + B_row_start) + B_col_start * B_internal_rows;" << std::endl;
-    std::cout << "  size_t bStep = " << block_size << " * B_col_inc * B_internal_rows;" << std::endl;
-  }
-  else if (!row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * " << block_size * vector_size << " * B_col_inc + B_col_start) * B_internal_rows + B_row_start;" << std::endl;
-    std::cout << "  size_t bStep = " << block_size << " * B_row_inc;" << std::endl;
-  }
-
-  std::cout << "  for(size_t a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep) { " << std::endl;
-  
-  // copy blocks of op(A) to shared memory (op(A) is column-major in shared memory then)
-  std::cout << "    for(size_t i = 0; i < " << vector_size << "; i++)  " << std::endl;
-  if (row_major_A && transpose_A)
-    std::cout << "      As[ (i*" << vector_size << " + row_thread_id) + " << block_size << " * col_thread_id] = (A[a + A_col_inc * (i * " << vector_size << " + row_thread_id) + A_internal_cols * A_row_inc * col_thread_id]);"  << std::endl;
-  else if (row_major_A && !transpose_A)
-    std::cout << "      As[ (i*" << vector_size << " + row_thread_id) + " << block_size << " * col_thread_id] = (A[a + A_internal_cols * A_row_inc * (i * " << vector_size << " + row_thread_id) + A_col_inc * col_thread_id]);"  << std::endl;
-  else if (!row_major_A && transpose_A)
-    std::cout << "      As[ (i*" << vector_size << " + row_thread_id) + " << block_size << " * col_thread_id] = (A[a + A_internal_rows * A_col_inc * (i * " << vector_size << " + row_thread_id) + A_row_inc * col_thread_id]);"  << std::endl;
-  else if (!row_major_A && !transpose_A)
-    std::cout << "      As[ (i*" << vector_size << " + row_thread_id) + " << block_size << " * col_thread_id] = (A[a + A_row_inc * (i * " << vector_size << " + row_thread_id) + A_internal_rows * A_col_inc * col_thread_id]);"  << std::endl;
-  std::cout << std::endl;
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
-
-  // initialize memory pointers
-  std::cout << std::endl;
-  std::cout << "    __local  float *ap = As; " << std::endl;
-  if (row_major_B && transpose_B)
-    std::cout << "    __global float *bp = B + (b + (" << block_size << " * row_thread_id + col_thread_id) * B_row_inc * B_internal_cols); " << std::endl;
-  else if (row_major_B && !transpose_B)
-    std::cout << "    __global float *bp = B + (b + (" << block_size << " * row_thread_id + col_thread_id) * B_col_inc); " << std::endl;
-  else if (!row_major_B && transpose_B)
-    std::cout << "    __global float *bp = B + (b + (" << block_size << " * row_thread_id + col_thread_id) * B_row_inc); " << std::endl;
-  else if (!row_major_B && !transpose_B)
-    std::cout << "    __global float *bp = B + (b + (" << block_size << " * row_thread_id + col_thread_id) * B_col_inc * B_internal_rows); " << std::endl;
-  std::cout << std::endl;
-
-  // run computations
-  std::cout << "    for(size_t i = 0; i < " << block_size << "; i++) { " << std::endl;
-  if (row_major_B && transpose_B)
-    std::cout << "      float bv = bp[i]; " << std::endl;
-  else if (row_major_B && !transpose_B)
-    std::cout << "      float bv = bp[i * B_internal_cols]; " << std::endl;
-  else if (!row_major_B && transpose_B)
-    std::cout << "      float bv = bp[i * B_internal_rows]; " << std::endl;
-  else if (!row_major_B && !transpose_B)
-    std::cout << "      float bv = bp[i]; " << std::endl;
-  std::cout << std::endl;
-  std::cout << "      for(size_t k = 0; k < " << block_size << "; k++)  " << std::endl;
-  std::cout << "	    cv[k] += ap[k] * bv; " << std::endl;
-  std::cout << std::endl;
-  std::cout << "      ap += " << block_size << "; " << std::endl;
-  std::cout << "    } " << std::endl;
-  std::cout << std::endl;
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
-  std::cout << "  } " << std::endl;
-
-  // write to C
-  if (row_major_C)
-  {
-      std::cout << "  int c = C_internal_cols * (C_row_inc * " << block_size << " * row_block_id + C_row_start) + "  //block row index
-                              << vector_size * block_size << " * C_col_inc * col_block_id + C_col_start " << std::endl;  //block column index
-      std::cout << "          + C_col_inc * (" << block_size << " * row_thread_id + col_thread_id); " << std::endl;
-  }
-  else
-  {
-      std::cout << "  int c = C_row_inc * " << block_size << " * row_block_id + C_row_start + ("   // block row index
-                              << vector_size * block_size << " * C_col_inc * col_block_id + C_col_start) * C_internal_rows " << std::endl;   // block column index
-      std::cout << "          + C_internal_rows * C_col_inc * (" << block_size << " * row_thread_id + col_thread_id); " << std::endl;
-  }
-
-  std::cout << "  for(size_t i = 0; i < " << block_size << "; i++) { " << std::endl;
-
-  if (row_major_C)
-  {
-    std::cout << "    C[c] = alpha * cv[i] + beta * C[c]; " << std::endl;
-    std::cout << "      c += C_internal_cols * C_row_inc; " << std::endl;
-  }
-  else
-  {
-    std::cout << "    C[c] = alpha * cv[i] + beta * C[c]; " << std::endl;
-    std::cout << "      c += C_row_inc; " << std::endl;
-  }
-
-  std::cout << "  } " << std::endl;
-  std::cout << "} " << std::endl;
-
-
-
-//  if (row_major_C)
-//    std::cout << "    C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = Csub;" << std::endl;
-//  else
-//    std::cout << "    C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = Csub;" << std::endl;
-  
-}
-
-void printUsage()
-{
-  std::cout << "Must have five parameters for C = A * B:" << std::endl;
-  std::cout << " 0/1 : storage layout for A (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for B (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for C (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : transpose for A (no/yes)" << std::endl;
-  std::cout << " 0/1 : transpose for B (no/yes)" << std::endl;
-}
-
-void readParameter(bool & param, char input)
-{
-  if (input == '0')
-    param = false;
-  else if (input == '1')
-    param = true;
-  else
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-}
-
-int main(int args, char * argsv[])
-{
-  if (args != 6)
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-  
-  //the following flags are 'true' for row_major layout
-  bool layout_A;
-  bool layout_B;
-  bool layout_C;
-
-  readParameter(layout_A, argsv[1][0]);
-  readParameter(layout_B, argsv[2][0]);
-  readParameter(layout_C, argsv[3][0]);
-  
-  bool transpose_A;
-  bool transpose_B;
-  readParameter(transpose_A, argsv[4][0]);
-  readParameter(transpose_B, argsv[5][0]);
-  
-  
-  printMatrixMatrixProduct(layout_A, layout_B, layout_C, transpose_A, transpose_B);
-}
diff --git a/auxiliary/hyb_matrix/align1/vec_mul.cl b/auxiliary/hyb_matrix/align1/vec_mul.cl
deleted file mode 100644
index b921c75..0000000
--- a/auxiliary/hyb_matrix/align1/vec_mul.cl
+++ /dev/null
@@ -1,49 +0,0 @@
-
-__kernel void vec_mul(
-    const __global int* ell_coords,
-    const __global float* ell_elements,
-    const __global uint* csr_rows,
-    const __global uint* csr_cols,
-    const __global float* csr_elements,
-    const __global float * vector,
-
-    __global float * result,
-
-    unsigned int row_num,
-    unsigned int internal_row_num,
-    unsigned int items_per_row,
-    unsigned int aligned_items_per_row
-    )
-{
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    for(uint row_id = glb_id; row_id < row_num; row_id += glb_sz)
-    {
-        float sum = 0;
-        
-        uint offset = row_id;
-        for(uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
-        {
-            float val = ell_elements[offset];
-
-
-            if(val != 0.0f)
-            {
-                int col = ell_coords[offset];    
-                sum += (vector[col] * val);
-            }
-            
-        }
-
-        uint col_begin = csr_rows[row_id];
-        uint col_end   = csr_rows[row_id + 1];
-
-        for(uint item_id = col_begin; item_id < col_end; item_id++)
-        {
-            sum += (vector[csr_cols[item_id]] * csr_elements[item_id]);
-        }
-
-        result[row_id] = sum;
-    }
-}
\ No newline at end of file
diff --git a/auxiliary/matrix_col/align1/add.cl b/auxiliary/matrix_col/align1/add.cl
deleted file mode 100644
index 89ed2ee..0000000
--- a/auxiliary/matrix_col/align1/add.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-__kernel void add(  // C = A + B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[i * C_row_inc + C_row_start + (j* C_col_inc + C_col_start) * C_internal_rows] =
-        A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows]
-        + B[i * B_row_inc + B_row_start + (j * B_col_inc + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/assign.cl b/auxiliary/matrix_col/align1/assign.cl
deleted file mode 100644
index c586785..0000000
--- a/auxiliary/matrix_col/align1/assign.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-__kernel void assign( // A <- B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] = 
-        B[i * B_row_inc + B_row_start + (j * B_col_inc + B_col_start) * B_internal_rows];
-}
-
diff --git a/auxiliary/matrix_col/align1/clear.cl b/auxiliary/matrix_col/align1/clear.cl
deleted file mode 100644
index 90f51e4..0000000
--- a/auxiliary/matrix_col/align1/clear.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void clear( // A <- 0
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] = 0;
-}
diff --git a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl b/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
deleted file mode 100644
index f259062..0000000
--- a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void cpu_inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] *= factor;
-}
-
diff --git a/auxiliary/matrix_col/align1/inplace_add.cl b/auxiliary/matrix_col/align1/inplace_add.cl
deleted file mode 100644
index d4a3933..0000000
--- a/auxiliary/matrix_col/align1/inplace_add.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void inplace_add( // A += B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] +=
-        B[i * B_row_inc + B_row_start + (j * B_col_inc + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/inplace_divide.cl b/auxiliary/matrix_col/align1/inplace_divide.cl
deleted file mode 100644
index 452d487..0000000
--- a/auxiliary/matrix_col/align1/inplace_divide.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-__kernel void inplace_divide( // A /= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] /= factor;
-}
-
diff --git a/auxiliary/matrix_col/align1/inplace_mult.cl b/auxiliary/matrix_col/align1/inplace_mult.cl
deleted file mode 100644
index 04555aa..0000000
--- a/auxiliary/matrix_col/align1/inplace_mult.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-__kernel void inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] *= factor;
-}
-
-
diff --git a/auxiliary/matrix_col/align1/inplace_sub.cl b/auxiliary/matrix_col/align1/inplace_sub.cl
deleted file mode 100644
index 5f02bcb..0000000
--- a/auxiliary/matrix_col/align1/inplace_sub.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-__kernel void inplace_sub( // A -= B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows] -=
-        B[i * B_row_inc + B_row_start + (j * B_col_inc + B_col_start) * B_internal_rows];
-}
-
diff --git a/auxiliary/matrix_col/align1/sub.cl b/auxiliary/matrix_col/align1/sub.cl
deleted file mode 100644
index 7279ad7..0000000
--- a/auxiliary/matrix_col/align1/sub.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-__kernel void sub(  // C = A - B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[i * C_row_inc + C_row_start + (j * C_col_inc + C_col_start) * C_internal_rows] =
-        A[i * A_row_inc + A_row_start + (j * A_col_inc + A_col_start) * A_internal_rows]
-         - B[i * B_row_inc + B_row_start + (j * B_col_inc + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/trans_vec_mul.cl b/auxiliary/matrix_col/align1/trans_vec_mul.cl
deleted file mode 100644
index 17fbef8..0000000
--- a/auxiliary/matrix_col/align1/trans_vec_mul.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-__kernel void trans_vec_mul(
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          __global float * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0))
-  {
-    float dot_prod = 0;
-    for (unsigned int col = 0; col < A_row_size; ++col)
-      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + col * v_inc];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
diff --git a/auxiliary/matrix_col/align1/vec_mul.cl b/auxiliary/matrix_col/align1/vec_mul.cl
deleted file mode 100644
index ea722c7..0000000
--- a/auxiliary/matrix_col/align1/vec_mul.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-__kernel void vec_mul(
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          __global float * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0))
-  {
-    float dot_prod = 0;
-    for (unsigned int col = 0; col < A_col_size; ++col)
-      dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
diff --git a/auxiliary/matrix_row/align1/add.cl b/auxiliary/matrix_row/align1/add.cl
deleted file mode 100644
index ada7283..0000000
--- a/auxiliary/matrix_row/align1/add.cl
+++ /dev/null
@@ -1,37 +0,0 @@
-
-__kernel void add(  // C = A + B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[(i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start] = 
-        A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start]
-        + B[(i * B_row_inc + B_row_start) * B_internal_cols + j * B_col_inc + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/assign.cl b/auxiliary/matrix_row/align1/assign.cl
deleted file mode 100644
index af993e8..0000000
--- a/auxiliary/matrix_row/align1/assign.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-__kernel void assign( // A <- B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] 
-       = B[(i * B_row_inc + B_row_start) * B_internal_cols + j * B_col_inc + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/clear.cl b/auxiliary/matrix_row/align1/clear.cl
deleted file mode 100644
index ffe4802..0000000
--- a/auxiliary/matrix_row/align1/clear.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void clear( // A <- 0
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] = 0;
-}
diff --git a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl b/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
deleted file mode 100644
index 721e6e1..0000000
--- a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void cpu_inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] *= factor;
-}
diff --git a/auxiliary/matrix_row/align1/inplace_add.cl b/auxiliary/matrix_row/align1/inplace_add.cl
deleted file mode 100644
index 01269b6..0000000
--- a/auxiliary/matrix_row/align1/inplace_add.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-__kernel void inplace_add( // A += B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] += 
-       B[(i * B_row_inc + B_row_start) * B_internal_cols + j * B_col_inc + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/inplace_divide.cl b/auxiliary/matrix_row/align1/inplace_divide.cl
deleted file mode 100644
index cff4de0..0000000
--- a/auxiliary/matrix_row/align1/inplace_divide.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_divide( // A /= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] /= factor;
-}
diff --git a/auxiliary/matrix_row/align1/inplace_mult.cl b/auxiliary/matrix_row/align1/inplace_mult.cl
deleted file mode 100644
index 7758b51..0000000
--- a/auxiliary/matrix_row/align1/inplace_mult.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-__kernel void inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] *= factor;
-}
-
-
diff --git a/auxiliary/matrix_row/align1/inplace_sub.cl b/auxiliary/matrix_row/align1/inplace_sub.cl
deleted file mode 100644
index acc10ff..0000000
--- a/auxiliary/matrix_row/align1/inplace_sub.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void inplace_sub( // A -= B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start] -=
-        B[(i * B_row_inc + B_row_start) * B_internal_cols + j * B_col_inc + B_col_start];
-}
diff --git a/auxiliary/matrix_row/align1/sub.cl b/auxiliary/matrix_row/align1/sub.cl
deleted file mode 100644
index 8fd4993..0000000
--- a/auxiliary/matrix_row/align1/sub.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-__kernel void sub(  // C = A - B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[(i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start] = 
-        A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start]
-        - B[(i * B_row_inc + B_row_start) * B_internal_cols + j * B_col_inc + B_col_start];
-}
diff --git a/auxiliary/matrix_row/align1/trans_vec_mul.cl b/auxiliary/matrix_row/align1/trans_vec_mul.cl
deleted file mode 100644
index c02f621..0000000
--- a/auxiliary/matrix_row/align1/trans_vec_mul.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-
-__kernel void trans_vec_mul(
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          __global float * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0))
-  {
-    float dot_prod = 0;
-    for (unsigned int col = 0; col < A_row_size; ++col)
-      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
-
diff --git a/auxiliary/matrix_row/align1/vec_mul.cl b/auxiliary/matrix_row/align1/vec_mul.cl
deleted file mode 100644
index fab3d36..0000000
--- a/auxiliary/matrix_row/align1/vec_mul.cl
+++ /dev/null
@@ -1,30 +0,0 @@
-
-__kernel void vec_mul(
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          __global float * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0))
-  {
-    float dot_prod = 0;
-    for (unsigned int col = 0; col < A_col_size; ++col)
-      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
-
-
diff --git a/auxiliary/nmf/align1/el_wise_mul_div.cl b/auxiliary/nmf/align1/el_wise_mul_div.cl
deleted file mode 100644
index d62ec9d..0000000
--- a/auxiliary/nmf/align1/el_wise_mul_div.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void el_wise_mul_div(
-          __global float * matrix1,
-          __global const float * matrix2,
-          __global const float * matrix3,
-          unsigned int size)
-{
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) 
-  {
-    float val = matrix1[i] * matrix2[i];
-    float divisor = matrix3[i];
-    matrix1[i] = (divisor > 0.00001) ? (val / divisor) : 0;
-  };
-};
diff --git a/auxiliary/nmf/align1/el_wise_mul_div.cl~ b/auxiliary/nmf/align1/el_wise_mul_div.cl~
deleted file mode 100644
index 79a5405..0000000
--- a/auxiliary/nmf/align1/el_wise_mul_div.cl~
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void el_wise_mul_div(
-          __global float * matrix1,
-          __global const float * matrix2,
-          __global const float * matrix3,
-          unsigned int size)
-{
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-       float val = matrix1[i] * matrix2[i];
-       float divisor = matrix3[i];
-       matrix1[i] = (divisor > 0.00001) ? (val / divisor) : 0;
-    };
-};
diff --git a/auxiliary/nmf/align1/sub_wise.cl b/auxiliary/nmf/align1/sub_wise.cl
deleted file mode 100644
index b1f2b42..0000000
--- a/auxiliary/nmf/align1/sub_wise.cl
+++ /dev/null
@@ -1,10 +0,0 @@
-
-__kernel void sub_wise(
-          __global const float * matrix1,
-          __global const float * matrix2,
-          __global float * result,
-          unsigned int size)
-{
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    result[i] = matrix1[i] - matrix2[i];
-}
diff --git a/auxiliary/nmf/align1/sub_wise.cl~ b/auxiliary/nmf/align1/sub_wise.cl~
deleted file mode 100644
index 79a5405..0000000
--- a/auxiliary/nmf/align1/sub_wise.cl~
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void el_wise_mul_div(
-          __global float * matrix1,
-          __global const float * matrix2,
-          __global const float * matrix3,
-          unsigned int size)
-{
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-       float val = matrix1[i] * matrix2[i];
-       float divisor = matrix3[i];
-       matrix1[i] = (divisor > 0.00001) ? (val / divisor) : 0;
-    };
-};
diff --git a/auxiliary/svd/align1/bidiag_pack.cl b/auxiliary/svd/align1/bidiag_pack.cl
deleted file mode 100644
index 6baa8ed..0000000
--- a/auxiliary/svd/align1/bidiag_pack.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-
-__kernel void bidiag_pack(__global float* A,
-                          __global float* D,
-                          __global float* S,
-                          uint size1,
-                          uint size2,
-                          uint stride
-                          ) {
-    uint size = min(size1, size2);
-
-    if(get_global_id(0) == 0)
-        S[0] = 0.0f;
-
-    for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) {
-        D[i] = A[i*stride + i];
-        S[i + 1] = (i + 1 < size2)?A[i*stride + (i + 1)]:0.0f;
-    }
-}
diff --git a/auxiliary/svd/align1/copy_col.cl b/auxiliary/svd/align1/copy_col.cl
deleted file mode 100644
index 6381d76..0000000
--- a/auxiliary/svd/align1/copy_col.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-
-// probably, this is a ugly way
-__kernel void copy_col(__global float* A,
-                       __global float* V,
-                       uint row_start,
-                       uint col_start,
-                       uint size,
-                       uint stride
-                       ) {
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    for(uint i = row_start + glb_id; i < size; i += glb_sz) {
-        V[i - row_start] = A[i * stride + col_start];
-    }    
-}
diff --git a/auxiliary/svd/align1/copy_row.cl b/auxiliary/svd/align1/copy_row.cl
deleted file mode 100644
index 0d7303a..0000000
--- a/auxiliary/svd/align1/copy_row.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-
-// probably, this is too
-__kernel void copy_row(__global float* A,
-                       __global float* V,
-                       uint row_start,
-                       uint col_start,
-                       uint size,
-                       uint stride
-                       ) {
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    for(uint i = col_start + glb_id; i < size; i += glb_sz) {
-        V[i - col_start] = A[row_start * stride + i];
-    }
-}
diff --git a/auxiliary/svd/align1/givens_prev.cl b/auxiliary/svd/align1/givens_prev.cl
deleted file mode 100644
index 8e62007..0000000
--- a/auxiliary/svd/align1/givens_prev.cl
+++ /dev/null
@@ -1,59 +0,0 @@
-
-
-__kernel void givens_prev(__global float* matr,
-                            __global float* cs,
-                            __global float* ss,
-                            uint size,
-                            uint stride,
-                            uint start_i,
-                            uint end_i
-                            )
-{
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    uint lcl_id = get_local_id(0);
-    uint lcl_sz = get_local_size(0);
-
-    uint j = glb_id;
-
-    __local float cs_lcl[256];
-    __local float ss_lcl[256];
-
-    float x = (j < size)?matr[(start_i - 1) * stride + j]:0;
-
-    uint elems_num = end_i - start_i;
-    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz;
-
-    for(uint block_id = 0; block_id < block_num; block_id++)
-    {
-        uint to = min(elems_num - block_id * lcl_sz, lcl_sz);
-
-        if(lcl_id < to)
-        {
-            cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz];
-            ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz];
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(j < size)
-        {
-            for(uint ind = 0; ind < to; ind++)
-            {
-                uint i = ind + start_i + block_id * lcl_sz;
-
-                float z = matr[i * stride + j];
-
-                float cs_val = cs_lcl[ind];//cs[i];
-                float ss_val = ss_lcl[ind];//ss[i];
-
-                matr[(i - 1) * stride + j] = x * cs_val + z * ss_val;
-                x = -x * ss_val + z * cs_val;
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if(j < size)
-        matr[(end_i - 1) * stride + j] = x;
-}
diff --git a/auxiliary/svd/align1/house_col.cl b/auxiliary/svd/align1/house_col.cl
deleted file mode 100644
index 1a4861a..0000000
--- a/auxiliary/svd/align1/house_col.cl
+++ /dev/null
@@ -1,59 +0,0 @@
-
-// calculates a sum of local array elements
-void col_reduce_lcl_array(__local float* sums, uint lcl_id, uint lcl_sz) {
-    uint step = lcl_sz >> 1;
-
-    while(step > 0) {
-        if(lcl_id < step) {
-            sums[lcl_id] += sums[lcl_id + step];
-        }
-        step >>= 1;
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-__kernel void house_col(__global float* A,
-                        __global float* QL,
-                        __constant float* V, //householder vector
-                        uint row_start,
-                        uint col_start,
-                        uint size1,
-                        uint size2,
-                        uint stride,
-                        uint strideQ,
-                        __local float* sums
-                        ) {
-    uint glb_id = get_global_id(0);
-    uint glb_sz = get_global_size(0);
-
-    uint grp_id = get_group_id(0);
-    uint grp_nm = get_num_groups(0);
-
-    uint lcl_id = get_local_id(0);
-    uint lcl_sz = get_local_size(0);
-
-    float ss = 0.0f;
-    // update of left matrix
-    for(uint i = grp_id; i < size1; i += grp_nm) {
-        ss = 0.0f;
-        for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]);
-        sums[lcl_id] = ss;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        col_reduce_lcl_array(sums, lcl_id, lcl_sz);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        float sum_Qv = sums[0];
-
-        for(uint j = lcl_id; j < size1; j += lcl_sz)
-            QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
-    }
-    // doing it in slightly different way to avoid cache misses
-    for(uint i = glb_id + col_start; i < size2; i += glb_sz) {
-        ss = 0.0f;
-        for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]);
-
-        for(uint j = row_start; j < size1; j++)
-            A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
-    }
-}
diff --git a/auxiliary/svd/align1/house_row.cl b/auxiliary/svd/align1/house_row.cl
deleted file mode 100644
index a37b2fe..0000000
--- a/auxiliary/svd/align1/house_row.cl
+++ /dev/null
@@ -1,71 +0,0 @@
-// calculates a sum of local array elements
-void row_reduce_lcl_array(__local float* sums, uint lcl_id, uint lcl_sz) {
-    uint step = lcl_sz >> 1;
-
-    while(step > 0) {
-        if(lcl_id < step) {
-            sums[lcl_id] += sums[lcl_id + step];
-        }
-        step >>= 1;
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-
-__kernel void house_row(__global float* A,
-                        __global float* QR,
-                        __global float* V, // householder vector
-                        uint row_start,
-                        uint col_start,
-                        uint size1,
-                        uint size2,
-                        uint stride,
-                        uint strideQ,
-                        __local float* sums
-                        ) {
-
-    uint glb_id = get_global_id(0);
-
-    uint grp_id = get_group_id(0);
-    uint grp_nm = get_num_groups(0);
-
-    uint lcl_id = get_local_id(0);
-    uint lcl_sz = get_local_size(0);
-
-    float ss = 0.0f;
-
-    // update of QR matrix
-    // Actually, we are calculating a transpose of right matrix. This allows to avoid cache
-    // misses. 
-    for(uint i = grp_id; i < size2; i += grp_nm) {
-        ss = 0.0f;
-        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]);
-        sums[lcl_id] = ss;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        row_reduce_lcl_array(sums, lcl_id, lcl_sz);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        float sum_Qv = sums[0];
-        for(uint j = lcl_id; j < size2; j += lcl_sz)
-            QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv);
-    }
-
-    // update of A matrix
-    for(uint i = grp_id + row_start; i < size1; i += grp_nm) {
-        ss = 0.0f;
-
-        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]);
-        sums[lcl_id] = ss;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        row_reduce_lcl_array(sums, lcl_id, lcl_sz);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        float sum_Av = sums[0];
-
-        for(uint j = lcl_id; j < size2; j += lcl_sz)
-            A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
-    }
-}
-
diff --git a/auxiliary/svd/align1/inverse_signs.cl b/auxiliary/svd/align1/inverse_signs.cl
deleted file mode 100644
index 223539f..0000000
--- a/auxiliary/svd/align1/inverse_signs.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-
-
-__kernel void inverse_signs(__global float* v,
-                            __global float* signs,
-                            uint size,
-                            uint stride 
-                            )
-{
-    uint glb_id_x = get_global_id(0);
-    uint glb_id_y = get_global_id(1);
-
-    if((glb_id_x < size) && (glb_id_y < size))
-        v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x];
-}
-
diff --git a/auxiliary/svd/align1/transpose_inplace.cl b/auxiliary/svd/align1/transpose_inplace.cl
deleted file mode 100644
index 521152b..0000000
--- a/auxiliary/svd/align1/transpose_inplace.cl
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-__kernel void transpose_inplace(__global float* input,
-                        unsigned int row_num,
-                        unsigned int col_num) {
-    unsigned int size = row_num * col_num;
-    for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) {
-        unsigned int row = i / col_num;
-        unsigned int col = i - row*col_num;
-
-        unsigned int new_pos = col * row_num + row;
-
-        //new_pos = col < row?0:1;
-        //input[i] = new_pos;
-
-        if(i < new_pos) {
-            float val = input[i];
-            input[i] = input[new_pos];
-            input[new_pos] = val;
-        }
-    }
-}
-
-
diff --git a/auxiliary/vector/align1/add.cl b/auxiliary/vector/align1/add.cl
deleted file mode 100644
index 591cb54..0000000
--- a/auxiliary/vector/align1/add.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-__kernel void add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] + vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/assign.cl b/auxiliary/vector/align1/assign.cl
deleted file mode 100644
index 05dba1a..0000000
--- a/auxiliary/vector/align1/assign.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void assign(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] = vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/clear.cl b/auxiliary/vector/align1/clear.cl
deleted file mode 100644
index 1dc93e1..0000000
--- a/auxiliary/vector/align1/clear.cl
+++ /dev/null
@@ -1,11 +0,0 @@
-
-__kernel void clear(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i*inc1+start1] = 0;
-}
-
diff --git a/auxiliary/vector/align1/cpu_inplace_mul_add.cl b/auxiliary/vector/align1/cpu_inplace_mul_add.cl
deleted file mode 100644
index fb12b39..0000000
--- a/auxiliary/vector/align1/cpu_inplace_mul_add.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void cpu_inplace_mul_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align1/cpu_inplace_mult.cl b/auxiliary/vector/align1/cpu_inplace_mult.cl
deleted file mode 100644
index df5eca0..0000000
--- a/auxiliary/vector/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-
-__kernel void cpu_inplace_mult(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i*inc1+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align1/cpu_mul_add.cl b/auxiliary/vector/align1/cpu_mul_add.cl
deleted file mode 100644
index 7c02900..0000000
--- a/auxiliary/vector/align1/cpu_mul_add.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void cpu_mul_add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3
-          ) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] * factor + vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/cpu_mult.cl b/auxiliary/vector/align1/cpu_mult.cl
deleted file mode 100644
index e25e4da..0000000
--- a/auxiliary/vector/align1/cpu_mult.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void cpu_mult(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor, 
-          __global float * result,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc2+start2] = vec[i*inc1+start1] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/diag_precond.cl b/auxiliary/vector/align1/diag_precond.cl
deleted file mode 100644
index abaa6f3..0000000
--- a/auxiliary/vector/align1/diag_precond.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void diag_precond(
-          __global const float * diag_A_inv, 
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * x, 
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    x[i*inc2+start2] *= diag_A_inv[i*inc1+start1];
-}
diff --git a/auxiliary/vector/align1/divide.cl b/auxiliary/vector/align1/divide.cl
deleted file mode 100644
index f013c53..0000000
--- a/auxiliary/vector/align1/divide.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-// Note: name 'div' is not allowed by the jit-compiler
-__kernel void divide(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3)  
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc1+start3] = vec[i*inc1+start1] / factor;
-}
-
diff --git a/auxiliary/vector/align1/index_norm_inf.cl b/auxiliary/vector/align1/index_norm_inf.cl
deleted file mode 100644
index a3e415b..0000000
--- a/auxiliary/vector/align1/index_norm_inf.cl
+++ /dev/null
@@ -1,58 +0,0 @@
-//index_norm_inf:
-unsigned int float_vector1_index_norm_inf_impl(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * float_buffer,
-          __local unsigned int * index_buffer)
-{
-  //step 1: fill buffer:
-  float cur_max = 0.0f;
-  float tmp;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp = fabs(vec[i*inc1+start1]);
-    if (cur_max < tmp)
-    {
-      float_buffer[get_global_id(0)] = tmp;
-      index_buffer[get_global_id(0)] = i;
-      cur_max = tmp;
-    }
-  }
-  
-  //step 2: parallel reduction:
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-    {
-      //find the first occurring index
-      if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])
-      {
-        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];
-        float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];
-      }
-      
-      //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];
-      //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);
-    }
-  }
-  
-  return index_buffer[0];
-}
-
-__kernel void index_norm_inf(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * float_buffer,
-          __local unsigned int * index_buffer,
-          global unsigned int * result) 
-{ 
-  unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, inc1, size1, float_buffer, index_buffer);
-  if (get_global_id(0) == 0) *result = tmp;
-}
-
-
diff --git a/auxiliary/vector/align1/inner_prod.cl b/auxiliary/vector/align1/inner_prod.cl
deleted file mode 100644
index 37e3714..0000000
--- a/auxiliary/vector/align1/inner_prod.cl
+++ /dev/null
@@ -1,64 +0,0 @@
-
-//helper:
-void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < stride)
-      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];
-  }
-}
-
-//////// inner products:
-float impl_inner_prod(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))
-    tmp += vec1[i*inc1+start1] * vec2[i*inc2+start2];
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_inner_prod_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-}
-
-
-__kernel void inner_prod(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_inner_prod(vec1,
-                              (      get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1,
-                              inc1,
-                              ((get_group_id(0) + 1) * size1) / get_num_groups(0)
-                                - (      get_group_id(0) * size1) / get_num_groups(0),
-                              vec2,
-                              (      get_group_id(0) * size2) / get_num_groups(0) * inc2 + start2,
-                              inc2,
-                              ((get_group_id(0) + 1) * size2) / get_num_groups(0)
-                                - (      get_group_id(0) * size2) / get_num_groups(0),
-                              tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;
-  
-}
-
diff --git a/auxiliary/vector/align1/inplace_add.cl b/auxiliary/vector/align1/inplace_add.cl
deleted file mode 100644
index d9e55f4..0000000
--- a/auxiliary/vector/align1/inplace_add.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void inplace_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/inplace_div_add.cl b/auxiliary/vector/align1/inplace_div_add.cl
deleted file mode 100644
index 6df0494..0000000
--- a/auxiliary/vector/align1/inplace_div_add.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-///// divide add:
-__kernel void inplace_div_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] / factor;
-}
\ No newline at end of file
diff --git a/auxiliary/vector/align1/inplace_div_sub.cl b/auxiliary/vector/align1/inplace_div_sub.cl
deleted file mode 100644
index 1c23d6e..0000000
--- a/auxiliary/vector/align1/inplace_div_sub.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-///// divide substract:
-__kernel void inplace_div_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] / factor;
-}
-
diff --git a/auxiliary/vector/align1/inplace_divide.cl b/auxiliary/vector/align1/inplace_divide.cl
deleted file mode 100644
index b940b01..0000000
--- a/auxiliary/vector/align1/inplace_divide.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void inplace_divide(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac)  //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i*inc1+start1] /= factor;
-}
-
diff --git a/auxiliary/vector/align1/inplace_mul_add.cl b/auxiliary/vector/align1/inplace_mul_add.cl
deleted file mode 100644
index 20c61b8..0000000
--- a/auxiliary/vector/align1/inplace_mul_add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_mul_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_mul_sub.cl b/auxiliary/vector/align1/inplace_mul_sub.cl
deleted file mode 100644
index 0882e1c..0000000
--- a/auxiliary/vector/align1/inplace_mul_sub.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_mul_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_mult.cl b/auxiliary/vector/align1/inplace_mult.cl
deleted file mode 100644
index be10b5b..0000000
--- a/auxiliary/vector/align1/inplace_mult.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void inplace_mult(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i*inc1+start1] *= factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_sub.cl b/auxiliary/vector/align1/inplace_sub.cl
deleted file mode 100644
index 31abe48..0000000
--- a/auxiliary/vector/align1/inplace_sub.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void inplace_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/mul_add.cl b/auxiliary/vector/align1/mul_add.cl
deleted file mode 100644
index 5084168..0000000
--- a/auxiliary/vector/align1/mul_add.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-__kernel void mul_add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3
-          ) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] * factor + vec2[i*inc2+start2];
-}
-
-
diff --git a/auxiliary/vector/align1/mul_sub.cl b/auxiliary/vector/align1/mul_sub.cl
deleted file mode 100644
index bc11d3c..0000000
--- a/auxiliary/vector/align1/mul_sub.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-///// multiply subtract:
-__kernel void mul_sub(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3
-          ) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] * factor - vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/mult.cl b/auxiliary/vector/align1/mult.cl
deleted file mode 100644
index b6b302e..0000000
--- a/auxiliary/vector/align1/mult.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void mult(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac, 
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec[i*inc1+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align1/norm_1.cl b/auxiliary/vector/align1/norm_1.cl
deleted file mode 100644
index 341457d..0000000
--- a/auxiliary/vector/align1/norm_1.cl
+++ /dev/null
@@ -1,49 +0,0 @@
-//helper:
-void helper_norm1_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-}
-
-////// norm_1
-float impl_norm_1(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))
-    tmp += fabs(vec[i*inc1 + start1]);
-  
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_norm1_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-};
-
-__kernel void norm_1(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_1(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1,
-                          inc1,
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) 
-                            - (      get_group_id(0) * size1) / get_num_groups(0),
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
-
diff --git a/auxiliary/vector/align1/norm_2.cl b/auxiliary/vector/align1/norm_2.cl
deleted file mode 100644
index 1e9b2fc..0000000
--- a/auxiliary/vector/align1/norm_2.cl
+++ /dev/null
@@ -1,52 +0,0 @@
-//helper:
-void helper_norm2_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-}
-
-////// norm_2
-float impl_norm_2(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  float vec_entry = 0;
-  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))
-  {
-    vec_entry = vec[i*inc1 + start1];
-    tmp += vec_entry * vec_entry;
-  }
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_norm2_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-};
-
-__kernel void norm_2(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_2(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                          inc1,
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) 
-                            - (      get_group_id(0) * size1) / get_num_groups(0),
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
-
diff --git a/auxiliary/vector/align1/norm_inf.cl b/auxiliary/vector/align1/norm_inf.cl
deleted file mode 100644
index 41a5dff..0000000
--- a/auxiliary/vector/align1/norm_inf.cl
+++ /dev/null
@@ -1,43 +0,0 @@
-
-////// norm_inf
-float impl_norm_inf(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))
-    tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  //step 2: parallel reduction:
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);
-  }
-  
-  return tmp_buffer[0];
-}
-
-__kernel void norm_inf(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_inf(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                          inc1, 
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0)
-                            - (      get_group_id(0) * size1) / get_num_groups(0),
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
diff --git a/auxiliary/vector/align1/plane_rotation.cl b/auxiliary/vector/align1/plane_rotation.cl
deleted file mode 100644
index d6b64d7..0000000
--- a/auxiliary/vector/align1/plane_rotation.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)
-__kernel void plane_rotation(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * vec2, 
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          float alpha,
-          float beta) 
-{ 
-  float tmp1 = 0;
-  float tmp2 = 0;
-
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp1 = vec1[i*inc1+start1];
-    tmp2 = vec2[i*inc2+start2];
-    
-    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
-    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
-  }
-
-}
-
diff --git a/auxiliary/vector/align1/sqrt_sum.cl b/auxiliary/vector/align1/sqrt_sum.cl
deleted file mode 100644
index 396ab24..0000000
--- a/auxiliary/vector/align1/sqrt_sum.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-// helper kernel for norm_2
-__kernel void sqrt_sum(
-          __global float * vec1,
-          unsigned int start1,  
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)*inc1+start1] += vec1[(get_global_id(0)+stride)*inc1+start1];
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = sqrt(vec1[start1]);
-  
-}
-
diff --git a/auxiliary/vector/align1/sub.cl b/auxiliary/vector/align1/sub.cl
deleted file mode 100644
index 28827ef..0000000
--- a/auxiliary/vector/align1/sub.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-__kernel void sub(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * vec2, 
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3)
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] - vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align1/sum.cl b/auxiliary/vector/align1/sum.cl
deleted file mode 100644
index ff0e1c0..0000000
--- a/auxiliary/vector/align1/sum.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-__kernel void sum(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)*inc1+start1] += vec1[(get_global_id(0)+stride)*inc1+start1];
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = vec1[start1];  
-}
-
diff --git a/auxiliary/vector/align1/swap.cl b/auxiliary/vector/align1/swap.cl
deleted file mode 100644
index f0f0d81..0000000
--- a/auxiliary/vector/align1/swap.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-
-////// swap:
-__kernel void swap(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2
-          ) 
-{ 
-  float tmp;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp = vec2[i*inc2+start2];
-    vec2[i*inc2+start2] = vec1[i*inc1+start1];
-    vec1[i*inc1+start1] = tmp;
-  }
-}
- 
diff --git a/auxiliary/vector/align1/vmax.cl b/auxiliary/vector/align1/vmax.cl
deleted file mode 100644
index 9ce45d6..0000000
--- a/auxiliary/vector/align1/vmax.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-__kernel void vmax(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory (make sure that size is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)*inc1+start1] = fmax(vec1[(get_global_id(0)+stride)*inc1+start1],
-                                                vec1[get_global_id(0)*inc1+start1]);
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = vec1[start1];
-}
-
diff --git a/auxiliary/vector/align16/add.cl b/auxiliary/vector/align16/add.cl
deleted file mode 100644
index 1995acd..0000000
--- a/auxiliary/vector/align16/add.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void add(
-          __global const float16 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float16 * vec2, 
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float16 * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3)
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] + vec2[i*inc2+start2];
-}
-
-
diff --git a/auxiliary/vector/align16/cpu_inplace_mul.cl b/auxiliary/vector/align16/cpu_inplace_mul.cl
deleted file mode 100644
index f271f56..0000000
--- a/auxiliary/vector/align16/cpu_inplace_mul.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void cpu_inplace_mult(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i*inc1+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align16/cpu_mult.cl b/auxiliary/vector/align16/cpu_mult.cl
deleted file mode 100644
index 5241da5..0000000
--- a/auxiliary/vector/align16/cpu_mult.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void cpu_mult(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor, 
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc2+start2] = vec[i*inc1+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align16/divide.cl b/auxiliary/vector/align16/divide.cl
deleted file mode 100644
index 0fc3ddb..0000000
--- a/auxiliary/vector/align16/divide.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-//Note: 'div' cannot be used because of complaints by the jit-compiler
-__kernel void divide(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2)  
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc2+start2] = vec[i*inc1+start1] / factor;
-}
-
-
diff --git a/auxiliary/vector/align16/inplace_add.cl b/auxiliary/vector/align16/inplace_add.cl
deleted file mode 100644
index a93dad2..0000000
--- a/auxiliary/vector/align16/inplace_add.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void inplace_add(
-          __global float16 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float16 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align16/inplace_divide.cl b/auxiliary/vector/align16/inplace_divide.cl
deleted file mode 100644
index 76741c1..0000000
--- a/auxiliary/vector/align16/inplace_divide.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-__kernel void inplace_divide(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac)  //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i*inc1+start1] /= factor;
-}
-
diff --git a/auxiliary/vector/align16/inplace_mult.cl b/auxiliary/vector/align16/inplace_mult.cl
deleted file mode 100644
index 6e5edfe..0000000
--- a/auxiliary/vector/align16/inplace_mult.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void inplace_mult(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i*inc1+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align16/inplace_sub.cl b/auxiliary/vector/align16/inplace_sub.cl
deleted file mode 100644
index e452e3f..0000000
--- a/auxiliary/vector/align16/inplace_sub.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void inplace_sub(
-          __global float16 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float16 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2];
-}
-
-
diff --git a/auxiliary/vector/align16/mult.cl b/auxiliary/vector/align16/mult.cl
deleted file mode 100644
index d55e667..0000000
--- a/auxiliary/vector/align16/mult.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void mult(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac, 
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc2+start2] = vec[i*inc1+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align16/sub.cl b/auxiliary/vector/align16/sub.cl
deleted file mode 100644
index 93e8077..0000000
--- a/auxiliary/vector/align16/sub.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void sub(
-          __global const float16 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float16 * vec2, 
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float16 * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3)
-{ 
-  unsigned int i_end = size1 / 16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] - vec2[i*inc2+start2];
-}
-
-
diff --git a/auxiliary/vector/align4/cpu_inplace_mul_add.cl b/auxiliary/vector/align4/cpu_inplace_mul_add.cl
deleted file mode 100644
index c71f129..0000000
--- a/auxiliary/vector/align4/cpu_inplace_mul_add.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void cpu_inplace_mul_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          float factor) 
-{ 
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align4/cpu_mul_add.cl b/auxiliary/vector/align4/cpu_mul_add.cl
deleted file mode 100644
index 53a4a9f..0000000
--- a/auxiliary/vector/align4/cpu_mul_add.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void cpu_mul_add(
-          __global const float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          float factor,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float4 * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3) 
-{ 
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] * factor + vec2[i*inc2+start2];
-}
-
diff --git a/auxiliary/vector/align4/inplace_div_add.cl b/auxiliary/vector/align4/inplace_div_add.cl
deleted file mode 100644
index 4cbf33d..0000000
--- a/auxiliary/vector/align4/inplace_div_add.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-__kernel void inplace_div_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1 / 4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] / factor;
-}
-
-
-
diff --git a/auxiliary/vector/align4/inplace_div_sub.cl b/auxiliary/vector/align4/inplace_div_sub.cl
deleted file mode 100644
index 3f73162..0000000
--- a/auxiliary/vector/align4/inplace_div_sub.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-__kernel void inplace_div_sub(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] / factor;
-}
-
-
diff --git a/auxiliary/vector/align4/inplace_mul_add.cl b/auxiliary/vector/align4/inplace_mul_add.cl
deleted file mode 100644
index 96618c9..0000000
--- a/auxiliary/vector/align4/inplace_mul_add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_mul_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  unsigned int size_div_4 = size1/4;
-  for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))
-    vec1[i*inc1+start1] += vec2[i*inc2+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align4/inplace_mul_sub.cl b/auxiliary/vector/align4/inplace_mul_sub.cl
deleted file mode 100644
index 4c8c3bc..0000000
--- a/auxiliary/vector/align4/inplace_mul_sub.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-__kernel void inplace_mul_sub(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i*inc1+start1] -= vec2[i*inc2+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align4/mul_add.cl b/auxiliary/vector/align4/mul_add.cl
deleted file mode 100644
index 0b074ae..0000000
--- a/auxiliary/vector/align4/mul_add.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void mul_add(
-          __global const float4 * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2,
-          __global float4 * result,
-          unsigned int start3,
-          unsigned int inc3,
-          unsigned int size3) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i*inc3+start3] = vec1[i*inc1+start1] * factor + vec2[i*inc2+start2];
-}
-
diff --git a/examples/tutorial/iterative-ublas.cpp~ b/examples/tutorial/iterative-ublas.cpp~
deleted file mode 100644
index a734afc..0000000
--- a/examples/tutorial/iterative-ublas.cpp~
+++ /dev/null
@@ -1,163 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain a suitable performance in ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-/*
-*
-*   Tutorial:  Iterative solvers without OpenCL
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  
-  //
-  // set up ILUT preconditioners for ViennaCL and ublas objects. Other preconditioners can also be used (see manual)
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
-                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilu0);
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_block_ilu0);
-
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilu0); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilu0);//with preconditioner
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
diff --git a/tests/src/generator_inner_product.cpp b/tests/src/generator_inner_product.cpp
deleted file mode 100644
index ecca138..0000000
--- a/tests/src/generator_inner_product.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-
-
-//
-// *** ViennaCL
-//
-// #define VIENNACL_DEBUG_ALL
-// #define VIENNACL_DEBUG_BUILD
-// #define VIENNACL_HAVE_UBLAS 1
-// #define VIENNACL_DEBUG_CUSTOM_OPERATION
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-#include "viennacl/generator/custom_operation.hpp"
-
-using namespace boost::numeric;
-
-template <class TYPE>
-bool readVectorFromFile ( const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec ) {
-    std::ifstream file ( filename.c_str() );
-
-    if ( !file ) return false;
-
-    unsigned int size;
-    file >> size;
-
-    if ( size > 20000 )  //keep execution times short
-        size = 20000;
-    vec.resize ( size );
-    for ( unsigned int i = 0; i < size; ++i ) {
-        TYPE element;
-        file >> element;
-        vec[i] = element;
-    }
-
-    return true;
-}
-
-template <typename ScalarType>
-ScalarType diff ( ScalarType & s1, viennacl::scalar<ScalarType> & s2 ) {
-    if ( s1 != s2 )
-        return ( s1 - s2 ) / std::max ( fabs ( s1 ), fabs ( s2 ) );
-    return 0;
-}
-
-template< typename NumericT,unsigned int Alignment, typename Epsilon >
-int test ( Epsilon const& epsilon, std::string vecfile, std::string resultfile ) {
-    int retval = EXIT_SUCCESS;
-
-    viennacl::scalar<NumericT>  vcl_res ( 0 );
-    ublas::vector<NumericT> vec;
-    ublas::vector<NumericT> vec2;
-
-    NumericT res;
-
-    viennacl::generator::gpu_symbolic_scalar<0,NumericT> symres;
-    viennacl::generator::symbolic_vector<1,NumericT,Alignment> symv;
-    viennacl::generator::symbolic_vector<2,NumericT,Alignment> symv2;
-    viennacl::generator::cpu_symbolic_scalar<3,NumericT> symscal;
-    viennacl::generator::cpu_symbolic_scalar<2,NumericT> symscal2;
-
-
-    if ( !readVectorFromFile<NumericT> ( vecfile, vec ) ) {
-        std::cout << "Error reading vec file" << std::endl;
-        retval = EXIT_FAILURE;
-    }
-// 
-    std::cout << "Running tests for vector of size " << vec.size() << std::endl;
-	std::cout << "----- Alignment " << Alignment << " -----" << std::endl;
-// 
-    viennacl::vector<NumericT,Alignment> vcl_vec ( vec.size() );
-    viennacl::vector<NumericT,Alignment> vcl_vec2 ( vec.size() );
-// 
-    vec2 = vec;
-    viennacl::copy ( vec.begin(), vec.end(), vcl_vec.begin() );
-    viennacl::copy ( vec2.begin(), vec2.end(), vcl_vec2.begin() );
-
-//     --------------------------------------------------------------------------
-
-    std::cout << "testing inner product..." << std::endl;
-	
-    res = ublas::inner_prod ( vec, vec2 );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symres = inner_prod ( symv, symv2 ) ) ( vcl_res, vcl_vec, vcl_vec2 ) );
-	if ( fabs ( diff ( res, vcl_res ) ) > epsilon ) {
-        std::cout << "# Error at operation: inner product" << std::endl;
-        std::cout << "  Diff " << fabs ( diff ( res, vcl_res ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inner product division..." << std::endl;
-    res = ublas::inner_prod ( vec, vec2 ) /ublas::inner_prod ( vec, vec );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symres = inner_prod ( symv, symv2 ) /inner_prod ( symv,symv ) ) ( vcl_res, vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( res, vcl_res ) ) > epsilon ) {
-        std::cout << "# Error at operation: inner product" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( res, vcl_res ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing scalar over inner product..." << std::endl;
-    res = 4/ublas::inner_prod ( vec, vec );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symres = symscal2/inner_prod ( symv,symv ) ) ( vcl_res, vcl_vec, 4.0f ) );
-    if ( fabs ( diff ( res, vcl_res ) ) > epsilon ) {
-        std::cout << "# Error at operation: scalar over inner product" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( res, vcl_res ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inner_prod minus ( scal minus inner_prod ) " << std::endl;
-    res = ublas::inner_prod ( vec, vec2 ) - ( 5.0f - inner_prod ( vec,vec2 ) );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symres = inner_prod ( symv, symv2 ) - ( symscal - inner_prod ( symv,symv2 ) ) ) ( vcl_res, vcl_vec, vcl_vec2, 5.0f ) );
-    if ( fabs ( diff ( res, vcl_res ) ) > epsilon ) {
-        std::cout << "# Error at operation: inner_prod minus ( scal minus inner_prod ) " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( res, vcl_res ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing nested inner product" << std::endl;
-    res = ublas::inner_prod ( vec, ublas::inner_prod ( vec,vec2 ) * vec2 );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symres = inner_prod ( symv, inner_prod ( symv,symv2 ) * symv2 ) ) ( vcl_res, vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( res, vcl_res ) ) > epsilon ) {
-        std::cout << "# Error at operation: nested inner product"  << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( res, vcl_res ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    return retval;
-}
-
-
-int main() {
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "## Test :: Inner Product" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-
-    int retval = EXIT_SUCCESS;
-
-    std::string vecfile ( "../examples/testdata/rhs65025.txt" );
-    std::string resultfile ( "../examples/testdata/result65025.txt" );
-
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-    {
-        typedef float NumericT;
-        NumericT epsilon = 1.0E-4;
-        std::cout << "# Testing setup:" << std::endl;
-        std::cout << "  eps:     " << epsilon << std::endl;
-        std::cout << "  numeric: float" << std::endl;
-        retval = test<NumericT,1> ( epsilon, vecfile, resultfile );
-//  		retval = test<NumericT,4> ( epsilon, vecfile, resultfile );
-        retval = test<NumericT,16> ( epsilon, vecfile, resultfile );
-        if ( retval == EXIT_SUCCESS )
-            std::cout << "# Test passed" << std::endl;
-        else
-            return retval;
-    }
-}
diff --git a/tests/src/generator_matrix.cpp b/tests/src/generator_matrix.cpp
deleted file mode 100644
index 751255f..0000000
--- a/tests/src/generator_matrix.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-
-//
-// *** ViennaCL
-//
-//#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "examples/tutorial/Random.hpp"
-#include "examples/benchmarks/benchmark-utils.hpp"
-#include "viennacl/generator/custom_operation.hpp"
-
-using namespace boost::numeric;
-
-const int matrix_size = 100;
-
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff ( ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2 ) {
-    ublas::matrix<ScalarType> mat2_cpu ( mat2.size1(), mat2.size2() );
-    copy ( mat2, mat2_cpu );
-    ScalarType ret = 0;
-    ScalarType act = 0;
-
-    for ( unsigned int i = 0; i < mat2_cpu.size1(); ++i ) {
-        for ( unsigned int j = 0; j < mat2_cpu.size2(); ++j ) {
-            act = fabs ( mat2_cpu ( i,j ) - mat1 ( i,j ) ) / std::max ( fabs ( mat2_cpu ( i, j ) ), fabs ( mat1 ( i,j ) ) );
-            if ( act > ret )
-                ret = act;
-        }
-    }
-    //std::cout << ret << std::endl;
-    return ret;
-}
-
-template< typename NumericT, typename Epsilon >
-int test ( Epsilon const& epsilon ) {
-
-    int retval = EXIT_SUCCESS;
-
-    ublas::matrix<NumericT> mat ( matrix_size, matrix_size );
-
-    NumericT                    cpu_scal = static_cast<NumericT> ( 42.1415 );
-    viennacl::scalar<NumericT>  gpu_scal = static_cast<NumericT> ( 42.1415 );
-
-    viennacl::matrix<NumericT> vcl_mat ( matrix_size, matrix_size );
-    viennacl::matrix<NumericT> vcl_mat2 ( matrix_size, matrix_size );
-
-    viennacl::generator::symbolic_matrix<0,NumericT> symm;
-    viennacl::generator::symbolic_matrix<1,NumericT> symm2;
-
-    viennacl::generator::cpu_symbolic_scalar<1,NumericT> cpu_sym_scal2;
-
-    viennacl::generator::gpu_symbolic_scalar<1,NumericT> gpu_sym_scal2;
-
-    for ( unsigned int i = 0; i < mat.size1(); ++i )
-        for ( unsigned int j = 0; j < mat.size2(); ++j )
-            mat ( i,j ) = static_cast<NumericT> ( 0.1 ) * random<NumericT>();
-
-    ublas::matrix<NumericT> mat2 ( mat ) ;
-
-    viennacl::copy ( mat, vcl_mat );
-    viennacl::copy ( mat2, vcl_mat2 );
-
-    std::cout << "Testing addition..." << std::endl;
-    mat     = mat + mat2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm + symm2 ) ( vcl_mat, vcl_mat2 ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "Testing inplace addition..." << std::endl;
-    mat     += mat2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm += symm2 ) ( vcl_mat, vcl_mat2 ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing substraction..." << std::endl;
-    mat     = mat - mat2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm - symm2 ) ( vcl_mat, vcl_mat2 ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: substraction" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "Testing inplace substraction..." << std::endl;
-    mat     -= mat2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm -= symm2 ) ( vcl_mat, vcl_mat2 ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    // --------------------------------------------------------------------------
-
-    std::cout << "testing cpu scalar multiplication ..." << std::endl;
-    mat     = mat*cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm*cpu_sym_scal2 ) ( vcl_mat, cpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: cpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace cpu scalar multiplication ..." << std::endl;
-    mat     *= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm *= cpu_sym_scal2 ) ( vcl_mat, cpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace cpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing cpu scalar division ..." << std::endl;
-    mat     = mat/cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm/cpu_sym_scal2 ) ( vcl_mat, cpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: cpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace cpu scalar division ..." << std::endl;
-    mat     /= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm /= cpu_sym_scal2 ) ( vcl_mat, cpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace cpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing gpu scalar multiplication ..." << std::endl;
-    mat     = mat*cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm*gpu_sym_scal2 ) ( vcl_mat, gpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: gpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace gpu scalar multiplication ..." << std::endl;
-    mat     *= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm *= gpu_sym_scal2 ) ( vcl_mat, gpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace gpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing gpu scalar division ..." << std::endl;
-    mat     = mat/cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm = symm/gpu_sym_scal2 ) ( vcl_mat, gpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: gpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace gpu scalar division ..." << std::endl;
-    mat     /= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symm /= gpu_sym_scal2 ) ( vcl_mat, gpu_scal ) );
-    if ( fabs ( diff ( mat, vcl_mat ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace gpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( mat, vcl_mat ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    return retval;
-
-}
-
-int main() {
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "## Test :: Matrix" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-
-    int retval = EXIT_SUCCESS;
-
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-    {
-        typedef float NumericT;
-        NumericT epsilon = 1.0E-3;
-        std::cout << "# Testing setup:" << std::endl;
-        std::cout << "  eps:     " << epsilon << std::endl;
-        std::cout << "  numeric: float" << std::endl;
-        std::cout << "  layout: row-major" << std::endl;
-        retval = test<NumericT> ( epsilon );
-        if ( retval == EXIT_SUCCESS )
-            std::cout << "# Test passed" << std::endl;
-        else
-            return retval;
-    }
-}
diff --git a/tests/src/generator_matrix_vector_product.cpp b/tests/src/generator_matrix_vector_product.cpp
deleted file mode 100644
index bd59f25..0000000
--- a/tests/src/generator_matrix_vector_product.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-// #define VIENNACL_DEBUG_CUSTOM_OPERATION
-
-//
-
-
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-
-
-
-//
-// *** ViennaCL
-//
-// #define VIENNACL_DEBUG_ALL
-// #define VIENNACL_DEBUG_BUILD
-// #define VIENNACL_HAVE_UBLAS 1
-// #define VIENNACL_DEBUG_CUSTOM_OPERATION
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "examples/tutorial/Random.hpp"
-#include "viennacl/generator/custom_operation.hpp"
-
-//
-// -------------------------------------------------------------
-//
-using namespace boost::numeric;
-//
-// -------------------------------------------------------------
-//
-
-template <typename ScalarType, unsigned int Alignment>
-ScalarType diff ( ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType,Alignment> & v2 ) {
-    ublas::vector<ScalarType> v2_cpu ( v2.size() );
-    viennacl::copy( v2.begin(), v2.end(), v2_cpu.begin() );
-    for ( unsigned int i=0; i<v1.size(); ++i ) {
-        if ( std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) ) > 0 )
-            v2_cpu[i] = fabs ( v2_cpu[i] - v1[i] ) / std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) );
-        else
-            v2_cpu[i] = 0.0;
-    }
-    return norm_inf ( v2_cpu );
-}
-
-template< typename NumericT,  typename F,typename F2, unsigned int Alignment, typename Epsilon >
-int test ( Epsilon const& epsilon ) {
-    int retval = EXIT_SUCCESS;
-    static const unsigned int SIZE = 100;
-    // --------------------------------------------------------------------------
-    ublas::vector<NumericT> rhs ( SIZE );
-    for ( unsigned int i = 0; i < rhs.size(); ++i )
-        rhs ( i ) = random<NumericT>();
-    ublas::vector<NumericT> rhs2 = rhs;
-    ublas::vector<NumericT> result = ublas::scalar_vector<NumericT> ( SIZE, 1 );
-    ublas::vector<NumericT> result2 = result;
-    ublas::vector<NumericT> rhs_trans = rhs;
-    rhs_trans.resize ( result.size(), true );
-    ublas::vector<NumericT> result_trans = ublas::zero_vector<NumericT> ( rhs.size() );
-
-
-    
-    ublas::matrix<NumericT,F2> matrix ( result.size(), rhs.size() );
-    for ( unsigned int i = 0; i < matrix.size1(); ++i )
-        for ( unsigned int j = 0; j < matrix.size2(); ++j )
-            matrix ( i,j ) =  random<NumericT>();
-
-
-	std::cout << "----- Alignment " << Alignment << " -----" << std::endl;
-
-    viennacl::vector<NumericT,Alignment> vcl_rhs ( rhs.size() );
-    viennacl::vector<NumericT,Alignment> vcl_rhs_trans ( rhs_trans.size() );
-    viennacl::vector<NumericT,Alignment> vcl_result_trans ( result_trans.size() );
-    viennacl::vector<NumericT,Alignment> vcl_result ( result.size() );
-    viennacl::matrix<NumericT, F, Alignment> vcl_matrix ( rhs.size(), rhs.size() );
-
-    viennacl::copy ( rhs.begin(), rhs.end(), vcl_rhs.begin() );
-    viennacl::copy ( result, vcl_result );
-    viennacl::copy ( matrix, vcl_matrix );
-
-    // --------------------------------------------------------------------------
-
-    viennacl::generator::symbolic_matrix<1,NumericT,F,Alignment> symm2;
-
-    viennacl::generator::symbolic_vector<0,NumericT,Alignment> symv;
-    viennacl::generator::symbolic_vector<2,NumericT,Alignment> symv3;
-
-    // --------------------------------------------------------------------------
-    std::cout << "matrix-vector product (no temporary)" << std::endl;
-    result     = ublas::prod ( matrix, rhs );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,symv3 ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: matrix-vector product (no temporary)" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-    
-    std::cout << "Prod times inprod (no temporary)" << std::endl;
-    result     = ublas::inner_prod ( rhs,rhs ) *ublas::prod ( matrix, rhs );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = inner_prod ( symv3,symv3 ) *prod ( symm2,symv3 ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: Prod times inprod" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "matrix-vector product (temporary)" << std::endl;
-    result     = ublas::prod ( matrix, result );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,symv ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: matrix-vector product (temporary)" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-// 
-    //--------------------------------------------------------------------------
-    std::cout << "prod minus ( v minus prod ) " << std::endl;
-    result     = ublas::prod ( matrix, rhs ) - ( rhs - ublas::prod ( matrix,rhs ) ) ;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,symv3 ) - ( symv3 - prod ( symm2,symv3 ) ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: prod minus ( v minus prod )" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-// 
-    //--------------------------------------------------------------------------
-    std::cout << "prod minus ( prod minus v ) " << std::endl;
-    result     = ublas::prod ( matrix, rhs ) - ( ublas::prod ( matrix,rhs ) - rhs ) ;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,symv3 ) - ( prod ( symm2,symv3 ) - symv3 ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: prod minus ( prod minus v )" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    //--------------------------------------------------------------------------
-    std::cout << "v minus ( prod minus v ) " << std::endl;
-    result     = rhs - ( ublas::prod ( matrix,rhs ) - rhs ) ;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv3 - ( prod ( symm2,symv3 ) - symv3 ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: v minus ( prod minus v )" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    //--------------------------------------------------------------------------
-    std::cout << "v minus ( v minus prod ) " << std::endl;
-    result     = rhs - ( rhs - ublas::prod ( matrix,rhs ) ) ;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv3 - ( symv3 - prod ( symm2,symv3 ) ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: v minus ( v minus prod )" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    //--------------------------------------------------------------------------
-    std::cout << "Nested matrix-vector product" << std::endl;
-    result     = ublas::prod ( matrix, ublas::vector<NumericT> ( ublas::prod ( matrix,result ) ) );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,prod ( symm2,symv ) ) ) ( vcl_result,vcl_matrix ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: nested matrix-vector product" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-
-//    --------------------------------------------------------------------------
-    std::cout << "Double nested matrix-vector product" << std::endl;
-    result	  = ublas::prod ( matrix,ublas::vector<NumericT> ( ublas::prod ( matrix, ublas::vector<NumericT> ( ublas::prod ( matrix,rhs ) ) ) ) );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = prod ( symm2,prod ( symm2,prod ( symm2,symv3 ) ) ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation: double nested matrix-vector product" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-
-    }
-    
-    /*std::cout << "Complicated mess..." << std::endl;
-    result     = result + ublas::prod ( matrix,result ) + ublas::inner_prod ( result, rhs ) *ublas::prod ( matrix,rhs );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + prod ( symm2,symv ) + inner_prod ( symv,symv3 ) *prod ( symm2,symv3 ) ) ( vcl_result,vcl_matrix,vcl_rhs ) );
-    if ( fabs ( diff ( result, vcl_result ) ) > epsilon ) {
-        std::cout << "# Error at operation : complicated mess" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( result, vcl_result ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }*/
-    
-    return retval;
-}
-int main() {
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "## Test :: Matrix-Vector Product" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-
-    int retval = EXIT_SUCCESS;
-
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-    {
-        typedef float NumericT;
-        NumericT epsilon = 1.0E-3;
-        std::cout << "# Testing setup:" << std::endl;
-        std::cout << "  eps:     " << epsilon << std::endl;
-        std::cout << "  numeric: float" << std::endl;
-        
-        std::cout << "---- Layout : Row Major" << std::endl;
-        retval = test<NumericT, viennacl::row_major,ublas::row_major,1> ( epsilon );
-        retval = test<NumericT, viennacl::row_major,ublas::row_major,16> ( epsilon );
-        
-        std::cout << "---- Layout : Column Major" << std::endl;
-        retval = test<NumericT, viennacl::column_major,ublas::column_major,1> ( epsilon );
-//         retval = test<NumericT, viennacl::column_major,ublas::column_major,16> ( epsilon );
-        
-        if ( retval == EXIT_SUCCESS )
-            std::cout << "# Test passed" << std::endl;
-        else
-            return retval;
-    }
-
-    return retval;
-}
diff --git a/tests/src/generator_vector.cpp b/tests/src/generator_vector.cpp
deleted file mode 100644
index 2cf3cab..0000000
--- a/tests/src/generator_vector.cpp
+++ /dev/null
@@ -1,331 +0,0 @@
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/foreach.hpp>
-
-//
-// *** ViennaCL
-//
-//#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
-#define VIENNACL_DEBUG_CUSTOM_OPERATION
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-#include "viennacl/generator/custom_operation.hpp"
-#include "viennacl/generator/elementwise_modifier.hpp"
-#include "viennacl/generator/symbolic_types/convenience_typedef.hpp"
-
-using namespace boost::numeric;
-using namespace viennacl::generator;
-
-std::string my_modifier(){ return "1/exp(-X)" ; }
-
-template <class TYPE>
-bool readVectorFromFile ( const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec ) {
-    std::ifstream file ( filename.c_str() );
-
-    if ( !file ) return false;
-
-    unsigned int size;
-    file >> size;
-
-    if ( size > 20000 )  //keep execution times short
-        size = 20000;
-    vec.resize ( size );
-
-    for ( unsigned int i = 0; i < size; ++i ) {
-        TYPE element;
-        file >> element;
-        vec[i] = element;
-    }
-
-    return true;
-}
-
-template <typename ScalarType, unsigned int Alignment>
-ScalarType diff ( ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType,Alignment> & v2 ) {
-    ublas::vector<ScalarType> v2_cpu ( v2.size() );
-    viennacl::copy( v2.begin(), v2.end(), v2_cpu.begin() );
-    for ( unsigned int i=0; i<v1.size(); ++i ) {
-        if ( std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) ) > 0 )
-            v2_cpu[i] = fabs ( v2_cpu[i] - v1[i] ) / std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) );
-        else
-            v2_cpu[i] = 0.0;
-    }
-    return norm_inf ( v2_cpu );
-}
-
-template< typename NumericT, unsigned int Alignment, typename Epsilon >
-int test ( Epsilon const& epsilon, std::string vecfile, std::string resultfile ) {
-    int retval = EXIT_SUCCESS;
-
-	
-    ublas::vector<NumericT> vec;
-    ublas::vector<NumericT> vec2;
-
-    NumericT                    cpu_scal = static_cast<NumericT> ( 42.1415 );
-    viennacl::scalar<NumericT>  gpu_scal = static_cast<NumericT> ( 42.1415 );
-
-    viennacl::generator::symbolic_vector<0,NumericT,Alignment> symv;
-    viennacl::generator::symbolic_vector<1,NumericT,Alignment> symv2;
-
-    viennacl::generator::cpu_symbolic_scalar<1,NumericT> cpu_sym_scal2;
-    viennacl::generator::gpu_symbolic_scalar<1,NumericT> gpu_sym_scal2;
-
-    viennacl::generator::cpu_symbolic_scalar<2,NumericT> cpu_sym_scal3;
-    viennacl::generator::gpu_symbolic_scalar<2,NumericT> gpu_sym_scal3;
-
-	
-    if ( !readVectorFromFile<NumericT> ( vecfile, vec ) ) {
-        std::cout << "Error reading vec file" << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-
-    std::cout << "Running tests for vector of size " << vec.size() << std::endl;
-	std::cout << "----- Alignment " << Alignment << " -----" << std::endl;
-	
-	viennacl::vector<NumericT,Alignment> vcl_vec ( vec.size() );
-    viennacl::vector<NumericT,Alignment> vcl_vec2 ( vec.size() );
-	
-    vec2 = vec;
-    viennacl::copy ( vec.begin(), vec.end(), vcl_vec.begin() );
-    viennacl::copy ( vec2.begin(), vec2.end(), vcl_vec2.begin() );
-
-    // --------------------------------------------------------------------------
-
-    
-    std::cout << "testing addition..." << std::endl;
-    vec     = ( vec - vec2 );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv - symv2 ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "Testing inplace addition..." << std::endl;
-    vec     += vec2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv += symv2 ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing substraction..." << std::endl;
-    vec     = vec - vec2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv - symv2 ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: substraction" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "Testing inplace substraction..." << std::endl;
-    vec     -= vec2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv -= symv2 ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    // --------------------------------------------------------------------------
-
-    std::cout << "testing cpu scalar multiplication ..." << std::endl;
-    vec     = vec*cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv*cpu_sym_scal2 ) ( vcl_vec, cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: cpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace cpu scalar multiplication ..." << std::endl;
-    vec     *= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv *= cpu_sym_scal2 ) ( vcl_vec, cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace cpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing cpu scalar division ..." << std::endl;
-    vec     = vec/cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv/cpu_sym_scal2 ) ( vcl_vec, cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: cpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace cpu scalar division ..." << std::endl;
-    vec     /= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv /= cpu_sym_scal2 ) ( vcl_vec, cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace cpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing gpu scalar multiplication ..." << std::endl;
-    vec     = vec*cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv*gpu_sym_scal2 ) ( vcl_vec, gpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: gpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing cpu and gpu scalar multiplication ..." << std::endl;
-    vec     = cpu_scal*vec*cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = cpu_sym_scal2*symv*gpu_sym_scal3 ) ( vcl_vec, cpu_scal, gpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: cpu and gpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace gpu scalar multiplication ..." << std::endl;
-    vec     *= cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv *= gpu_sym_scal2 ) ( vcl_vec, gpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace gpu scalar multiplication " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing gpu scalar division ..." << std::endl;
-    vec     = vec/cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv/gpu_sym_scal2 ) ( vcl_vec, gpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: gpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing inplace gpu scalar division ..." << std::endl;
-    vec     /=cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv /= gpu_sym_scal2 ) ( vcl_vec, gpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: inplace gpu scalar division " << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    // --------------------------------------------------------------------------
-
-    std::cout << "testing addition scalar multiplication..." << std::endl;
-    vec     = vec + cpu_scal*vec2;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + cpu_sym_scal3*symv2 ) ( vcl_vec, vcl_vec2, cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: addition scalar multiplication" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-
-    // --------------------------------------------------------------------------
-    std::cout << "testing multiple addition..." << std::endl;
-    vec     = vec + vec2 + vec;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + symv2 + symv ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: multiple addition" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing substraction with parenthesis" << std::endl;
-    vec     = vec - ( vec2 - vec );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv - ( symv2 - symv ) ) ( vcl_vec, vcl_vec2 ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: substraction with parenthesis" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing tree expansion right minus" << std::endl;
-    vec     = vec + cpu_scal* ( vec2 - vec );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + cpu_sym_scal3* ( symv2 - symv ) ) ( vcl_vec, vcl_vec2,cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: tree expansion right minus" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing tree expansion right plus" << std::endl;
-    vec     = vec + cpu_scal* ( vec2 + vec );
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + cpu_sym_scal3* ( symv2 + symv ) ) ( vcl_vec, vcl_vec2,cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: tree expansion right plus" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing tree expansion left minus" << std::endl;
-    vec     = vec + ( vec2 - vec ) *cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + ( symv2 - symv ) *cpu_sym_scal3 ) ( vcl_vec, vcl_vec2,cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: tree expansion left minus" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    std::cout << "testing tree expansion left plus" << std::endl;
-    vec     = vec + ( vec2 + vec ) *cpu_scal;
-    viennacl::ocl::enqueue ( viennacl::generator::custom_operation ( symv = symv + ( symv2 + symv ) *cpu_sym_scal3 ) ( vcl_vec, vcl_vec2,cpu_scal ) );
-    if ( fabs ( diff ( vec, vcl_vec ) ) > epsilon ) {
-        std::cout << "# Error at operation: tree expansion left plus" << std::endl;
-        std::cout << "  diff: " << fabs ( diff ( vec, vcl_vec ) ) << std::endl;
-        retval = EXIT_FAILURE;
-    }
-
-    return retval;
-}
-
-
-int main() {
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "## Test :: Vector" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-
-    int retval = EXIT_SUCCESS;
-
-    std::string vecfile ( "../examples/testdata/rhs65025.txt" );
-    std::string resultfile ( "../examples/testdata/result65025.txt" );
-
-    std::cout << std::endl;
-    std::cout << "----------------------------------------------" << std::endl;
-    std::cout << std::endl;
-    {
-        typedef float NumericT;
-        NumericT epsilon = 1.0E-4;
-        std::cout << "# Testing setup:" << std::endl;
-        std::cout << "  eps:     " << epsilon << std::endl;
-        std::cout << "  numeric: float" << std::endl;
-        retval = test<NumericT,1> ( epsilon, vecfile, resultfile );
-// 		retval &= test<NumericT,2> ( epsilon, vecfile, resultfile );
-		retval &= test<NumericT,4> ( epsilon, vecfile, resultfile );
-// 		retval &= test<NumericT,8> ( epsilon, vecfile, resultfile );
-		retval &= test<NumericT,16> ( epsilon, vecfile, resultfile );
-        if ( retval == EXIT_SUCCESS )
-            std::cout << "# Test passed" << std::endl;
-        else
-            return retval;
-    }
-}
diff --git a/tests/src/matrix.cpp b/tests/src/matrix.cpp
deleted file mode 100644
index 0108760..0000000
--- a/tests/src/matrix.cpp
+++ /dev/null
@@ -1,533 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-//
-// *** ViennaCL
-//
-//#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "examples/tutorial/Random.hpp"
-
-//
-// -------------------------------------------------------------
-//
-using namespace boost::numeric;
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
-{
-   if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
-   return 0;
-}
-
-template <typename ScalarType>
-ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   for (unsigned int i=0;i<v1.size(); ++i)
-   {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
-      else
-         v2_cpu[i] = 0.0;
-   }
-
-   return norm_inf(v2_cpu);
-}
-
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
-{
-   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-   copy(mat2, mat2_cpu);
-   ScalarType ret = 0;
-   ScalarType act = 0;
-
-    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
-      {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
-         if (act > ret)
-           ret = act;
-      }
-    }
-   //std::cout << ret << std::endl;
-   return ret;
-}
-
-//
-// -------------------------------------------------------------
-//
-template< typename NumericT, typename F, typename Epsilon >
-int test(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   
-   std::size_t num_rows = 121;
-   std::size_t num_cols = 103;
-   
-   // --------------------------------------------------------------------------            
-   ublas::vector<NumericT> rhs(num_rows);
-   for (unsigned int i = 0; i < rhs.size(); ++i)
-     rhs(i) = random<NumericT>();
-   ublas::vector<NumericT> rhs2 = rhs;
-   ublas::vector<NumericT> result = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
-   ublas::vector<NumericT> result2 = result;
-   ublas::vector<NumericT> rhs_trans = rhs;
-   rhs_trans.resize(result.size(), true);
-   ublas::vector<NumericT> result_trans = ublas::zero_vector<NumericT>(rhs.size());
-
-  
-   ublas::matrix<NumericT> matrix(result.size(), rhs.size());
-  
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-      for (unsigned int j = 0; j < matrix.size2(); ++j)
-         matrix(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   viennacl::vector<NumericT> vcl_rhs(rhs.size());
-   viennacl::vector<NumericT> vcl_rhs_trans(rhs_trans.size());
-   viennacl::vector<NumericT> vcl_result_trans(result_trans.size());
-   viennacl::vector<NumericT> vcl_result(result.size()); 
-   viennacl::matrix<NumericT, F> vcl_matrix(result.size(), rhs.size());
-
-   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   viennacl::copy(result, vcl_result);
-   viennacl::copy(matrix, vcl_matrix);
-   
-   std::cout << "Matrix resizing (to larger)" << std::endl;
-   matrix.resize(2*num_rows, 2*num_cols, true);
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-   {
-      for (unsigned int j = (i<result.size() ? rhs.size() : 0); j < matrix.size2(); ++j)
-         matrix(i,j) = 0;
-   }
-   vcl_matrix.resize(2*num_rows, 2*num_cols, true);
-   viennacl::copy(vcl_matrix, matrix);
-   if( fabs(diff(matrix, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix resize (to larger)" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   
-   matrix(12, 14) = NumericT(1.9);
-   matrix(19, 16) = NumericT(1.0);
-   matrix (13, 15) =  NumericT(-9);
-   vcl_matrix(12, 14) = NumericT(1.9);
-   vcl_matrix(19, 16) = NumericT(1.0);
-   vcl_matrix (13, 15) =  NumericT(-9);
-   
-   std::cout << "Matrix resizing (to smaller)" << std::endl;
-   matrix.resize(result.size(), rhs.size(), true);
-   vcl_matrix.resize(result.size(), rhs.size(), true);
-   if( fabs(diff(matrix, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix resize (to smaller)" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-
-
-   std::cout << "Matrix addition and subtraction" << std::endl;
-   viennacl::matrix<NumericT, F> vcl_matrix2 = vcl_matrix;
-   vcl_matrix2 += vcl_matrix;
-   vcl_matrix2 -= vcl_matrix;
-   vcl_matrix2 = vcl_matrix2 + vcl_matrix;
-   vcl_matrix2 = vcl_matrix2 - vcl_matrix;
-
-   if( fabs(diff(matrix, vcl_matrix2)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix addition and subtraction" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix2)) << std::endl;
-      return EXIT_FAILURE;
-   }
-
-   // --------------------------------------------------------------------------            
-   std::cout << "Rank 1 update" << std::endl;
-   ublas::matrix<NumericT> matrix2 = matrix;
-   
-   matrix2 += ublas::outer_prod(result, rhs);
-   vcl_matrix += viennacl::linalg::outer_prod(vcl_result, vcl_rhs);
-   if( fabs(diff(matrix2, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: rank 1 update" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix2, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-   std::cout << "Scaled rank 1 update" << std::endl;
-   matrix2 += 4.2f * ublas::outer_prod(result, rhs);
-   vcl_matrix += 2.1f * viennacl::linalg::outer_prod(vcl_result, vcl_rhs);
-   vcl_matrix += viennacl::linalg::outer_prod(vcl_result, vcl_rhs) * 2.1f;  //check proper compilation
-   if( fabs(diff(matrix2, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: scaled rank 1 update" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix2, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   
-   //reset vcl_matrix:
-   viennacl::copy(matrix, vcl_matrix);
-   
-   // --------------------------------------------------------------------------            
-   std::cout << "Matrix-Vector product" << std::endl;
-   result     = viennacl::linalg::prod(matrix, rhs);
-   vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);
-   
-   for (std::size_t i=0; i<result.size(); ++i)
-   {
-     std::cout << rhs(i) << ", " << vcl_rhs(i) << ", " << result(i) << ", " << vcl_result(i) << std::endl; 
-   }
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-   std::cout << "Matrix-Vector product with scaled add" << std::endl;
-   NumericT alpha = static_cast<NumericT>(2.786);
-   NumericT beta = static_cast<NumericT>(1.432);
-   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   viennacl::copy(result.begin(), result.end(), vcl_result.begin());
-
-   result     = alpha * viennacl::linalg::prod(matrix, rhs) + beta * result;
-   vcl_result = alpha * viennacl::linalg::prod(vcl_matrix, vcl_rhs) + beta * vcl_result;
-
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-
-   viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
-   viennacl::copy(result_trans.begin(), result_trans.end(), vcl_result_trans.begin());
-
-   std::cout << "Transposed Matrix-Vector product" << std::endl;
-   result_trans     = alpha * viennacl::linalg::prod(trans(matrix), rhs_trans);  
-   vcl_result_trans = alpha * viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
-
-   if( fabs(diff(result_trans, vcl_result_trans)) > epsilon )
-   {
-      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result_trans, vcl_result_trans)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
-   result_trans     = alpha * viennacl::linalg::prod(trans(matrix), rhs_trans) + beta * result_trans;  
-   vcl_result_trans = alpha * viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans) + beta * vcl_result_trans;
-
-   if( fabs(diff(result_trans, vcl_result_trans)) > epsilon )
-   {
-      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result_trans, vcl_result_trans)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-
-   /////////////////// test direct solvers ////////////////////////////
-   
-   rhs.resize(40);
-   matrix.resize(rhs.size(), rhs.size());
-   result.resize(rhs.size());
-
-   std::cout << "Resizing vcl_rhs..." << std::endl;
-   vcl_rhs.resize(rhs.size());
-   std::cout << "Resizing vcl_rhs done" << std::endl;
-   vcl_matrix.resize(rhs.size(), rhs.size());
-   std::cout << "Resizing vcl_result..." << std::endl;
-   vcl_result.resize(rhs.size());
-   std::cout << "Resizing vcl_result done" << std::endl;
-
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-   {
-      for (unsigned int j = 0; j < matrix.size2(); ++j)
-         matrix(i,j) = -random<NumericT>();
-      rhs(i) = random<NumericT>();
-   }
-
-   //force unit diagonal
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-      matrix(i,i) = static_cast<NumericT>(3) + random<NumericT>();
-
-   viennacl::copy(matrix, vcl_matrix);
-   viennacl::copy(rhs, vcl_rhs);
-
-   //upper triangular:
-   std::cout << "Upper triangular solver" << std::endl;
-   result = ublas::solve(matrix, rhs, ublas::upper_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //upper unit triangular:
-   std::cout << "Upper unit triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::unit_upper_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::unit_upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //lower triangular:
-   std::cout << "Lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::lower_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //lower unit triangular:
-   std::cout << "Lower unit triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::unit_lower_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::unit_lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-
-
-
-
-   //transposed upper triangular:
-   std::cout << "Transposed upper triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::upper_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed upper unit triangular:
-   std::cout << "Transposed unit upper triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::unit_upper_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::unit_upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed lower triangular:
-   std::cout << "Transposed lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::lower_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed lower unit triangular:
-   std::cout << "Transposed unit lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::unit_lower_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::unit_lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-   //full solver:
-   std::cout << "Full solver" << std::endl;
-   unsigned int lu_dim = 100;
-   ublas::matrix<NumericT> square_matrix(lu_dim, lu_dim);
-   ublas::vector<NumericT> lu_rhs(lu_dim);
-   viennacl::matrix<NumericT, F> vcl_square_matrix(lu_dim, lu_dim);
-   viennacl::vector<NumericT> vcl_lu_rhs(lu_dim);
-
-   for (std::size_t i=0; i<lu_dim; ++i)
-     for (std::size_t j=0; j<lu_dim; ++j)
-       square_matrix(i,j) = -static_cast<NumericT>(0.5) * random<NumericT>();
-
-   //put some more weight on diagonal elements:
-   for (std::size_t j=0; j<lu_dim; ++j)
-   {
-     square_matrix(j,j) = static_cast<NumericT>(20.0) + random<NumericT>();
-     lu_rhs(j) = random<NumericT>();
-   }
-   
-   viennacl::copy(square_matrix, vcl_square_matrix);
-   viennacl::copy(lu_rhs, vcl_lu_rhs);
-   
-   //ublas::
-   ublas::lu_factorize(square_matrix);
-   ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
-   ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
-
-   // ViennaCL:
-   viennacl::linalg::lu_factorize(vcl_square_matrix);
-   //viennacl::copy(square_matrix, vcl_square_matrix);
-   viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-
-   if( fabs(diff(lu_rhs, vcl_lu_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: dense solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(lu_rhs, vcl_lu_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-
-   return retval;
-}
-//
-// -------------------------------------------------------------
-//
-int main()
-{
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "## Test :: Matrix" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-
-   int retval = EXIT_SUCCESS;
-
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-3);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      std::cout << "  layout: row-major" << std::endl;
-      retval = test<NumericT, viennacl::row_major>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-3);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      std::cout << "  layout: column-major" << std::endl;
-      retval = test<NumericT, viennacl::column_major>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   
-   
-   if( viennacl::ocl::current_device().double_support() )
-   {
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         std::cout << "  layout: row-major" << std::endl;
-         retval = test<NumericT, viennacl::row_major>(epsilon);
-            if( retval == EXIT_SUCCESS )
-               std::cout << "# Test passed" << std::endl;
-            else
-              return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         std::cout << "  layout: column-major" << std::endl;
-         retval = test<NumericT, viennacl::column_major>(epsilon);
-            if( retval == EXIT_SUCCESS )
-               std::cout << "# Test passed" << std::endl;
-            else
-              return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-   }
-   return retval;
-}
diff --git a/tests/src/matrix_range.cpp b/tests/src/matrix_range.cpp
deleted file mode 100644
index c53470b..0000000
--- a/tests/src/matrix_range.cpp
+++ /dev/null
@@ -1,558 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/matrix_proxy.hpp"
-#include "viennacl/vector_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/matrix_proxy.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-           
-template <typename VectorType, typename VCLVectorType>
-bool check_for_equality_vector(VectorType const & ublas_v, VCLVectorType const & vcl_v)
-{
-  typedef typename VectorType::value_type   value_type;
-  
-  boost::numeric::ublas::vector<value_type> vcl_v_cpu(vcl_v.size());
-  viennacl::copy(vcl_v, vcl_v_cpu);
-  
-  for (std::size_t i=0; i<ublas_v.size(); ++i)
-  {
-    if (ublas_v(i) != vcl_v_cpu(i))
-    {
-      if ( std::abs(ublas_v(i) - vcl_v_cpu(i)) / std::max(ublas_v(i), vcl_v_cpu(i)) > 1e-5 ) 
-      {
-        std::cout << "Error at index (" << i << "): " << ublas_v(i) << " vs " << vcl_v_cpu(i) << std::endl;
-        std::cout << ublas_v << std::endl;
-        std::cout << vcl_v_cpu << std::endl;
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-
-template <typename MatrixType, typename VCLMatrixType>
-bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A)
-{
-  typedef typename MatrixType::value_type   value_type;
-  
-  boost::numeric::ublas::matrix<value_type> vcl_A_cpu(vcl_A.size1(), vcl_A.size2());
-  viennacl::copy(vcl_A, vcl_A_cpu);
-  
-  for (std::size_t i=0; i<ublas_A.size1(); ++i)
-  {
-    for (std::size_t j=0; j<ublas_A.size2(); ++j)
-    {
-      if (ublas_A(i,j) != vcl_A_cpu(i,j))
-      {
-        if ( std::abs(ublas_A(i,j) - vcl_A_cpu(i,j)) / std::max(ublas_A(i,j), vcl_A_cpu(i,j)) > 1e-5 ) 
-        {
-          std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
-          std::cout << ublas_A << std::endl;
-          std::cout << vcl_A_cpu << std::endl;
-          return false;
-        }
-      }
-    }
-  }
-  return true;
-}
-
-
-
-           
-template <typename T, typename ScalarType>
-int run_test()
-{
-    //typedef float               ScalarType;
-    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
-    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-    
-    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
-    typedef viennacl::vector<ScalarType>       VCLVectorType;
-    
-    viennacl::scalar<ScalarType> gpu_pi = ScalarType(3.1415);
-    
-    std::size_t dim_large = 151;
-    std::size_t dim_small = 37;
-    //std::size_t dim_large = 35;
-    //std::size_t dim_small = 17;
-    
-    //setup ublas objects:
-    MatrixType ublas_A(dim_large, dim_large);
-    for (std::size_t i=0; i<ublas_A.size1(); ++i)
-      for (std::size_t j=0; j<ublas_A.size2(); ++j)
-        ublas_A(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_B(dim_small, dim_small);
-    for (std::size_t i=0; i<ublas_B.size1(); ++i)
-      for (std::size_t j=0; j<ublas_B.size2(); ++j)
-        ublas_B(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_C(dim_large, dim_small);
-    for (std::size_t i=0; i<ublas_C.size1(); ++i)
-      for (std::size_t j=0; j<ublas_C.size2(); ++j)
-        ublas_C(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-
-    MatrixType ublas_D(dim_small, dim_large);
-    for (std::size_t i=0; i<ublas_D.size1(); ++i)
-      for (std::size_t j=0; j<ublas_D.size2(); ++j)
-        ublas_D(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-      
-    boost::numeric::ublas::range ublas_r1(0, dim_small);
-    boost::numeric::ublas::range ublas_r2(dim_large - dim_small, dim_large);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub1(ublas_A, ublas_r1, ublas_r1);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub2(ublas_A, ublas_r2, ublas_r2);
-
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_D_sub(ublas_D, ublas_r1, ublas_r1);
-
-    //Setup ViennaCL objects    
-    VCLMatrixType vcl_A(dim_large, dim_large);
-    viennacl::copy(ublas_A, vcl_A);
-    VCLMatrixType vcl_B(dim_small, dim_small);
-    viennacl::copy(ublas_B, vcl_B);
-    VCLMatrixType vcl_C(dim_large, dim_small);
-    viennacl::copy(ublas_C, vcl_C);
-    VCLMatrixType vcl_D(dim_small, dim_large);
-    viennacl::copy(ublas_D, vcl_D);
-    
-    viennacl::range vcl_r1(0, dim_small);
-    viennacl::range vcl_r2(dim_large - dim_small, dim_large);
-    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1);
-    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2);
-    
-    viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
-    viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    ublas_A_sub1 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub1);
-    std::cout << "Testing upper left copy to A... ";
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    ublas_A_sub2 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub2);
-    std::cout << "Testing lower right copy to A... ";
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    
-    ublas_C_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_C_sub);
-    std::cout << "Testing upper copy to C... ";
-    if (check_for_equality(ublas_C, vcl_C))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    
-    ublas_D_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_D_sub);
-    std::cout << "Testing left copy to D... ";
-    if (check_for_equality(ublas_D, vcl_D))
-      std::cout << "PASSED!" << std::endl;
-    else
-      std::cout << std::endl << "TEST failed!";
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing upper left copy to A... ";
-    if (check_for_equality(ublas_A_sub1, vcl_A_sub1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing lower right copy to A... ";
-    if (check_for_equality(ublas_A_sub2, vcl_A_sub2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing upper copy to C... ";
-    if (check_for_equality(ublas_C_sub, vcl_C_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing left copy to D... ";
-    if (check_for_equality(ublas_D_sub, vcl_D_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 3: Addition //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 += ublas_A_sub2;
-    vcl_A_sub2 += vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B += ublas_A_sub2;
-    vcl_B += vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 + ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 + ublas_A_sub2;
-    vcl_B = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 -= ublas_A_sub2;
-    vcl_A_sub2 -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B -= ublas_A_sub2;
-    vcl_B -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 - ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 - ublas_A_sub2;
-    vcl_B = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 5: Scaling //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A, vcl_A);
-    
-    std::cout << "Multiplication with CPU scalar: ";
-    ublas_A_sub2 *= ScalarType(3.1415);
-    vcl_A_sub2 *= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Multiplication with GPU scalar: ";
-    ublas_A_sub2 *= gpu_pi;
-    vcl_A_sub2 *= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "Division with CPU scalar: ";
-    ublas_A_sub2 /= ScalarType(3.1415);
-    vcl_A_sub2 /= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Division with GPU scalar: ";
-    ublas_A_sub2 /= gpu_pi;
-    vcl_A_sub2 /= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 6: Matrix-Matrix Products //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    std::cout << "Assigned C = A * B: ";
-    ublas_A_sub1 = prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A * B^T: ";
-    ublas_A_sub1 = prod(ublas_C_sub, trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B^T: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add of prod(): ";
-    ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 7: Matrix-Vector Products //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    VectorType ublas_v1(dim_large);
-    for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = i;
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub(ublas_v1, ublas_r1);
-
-    VectorType ublas_v2(dim_large);
-    for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = i - 5;
-    boost::numeric::ublas::vector_range<VectorType> ublas_v2_sub(ublas_v2, ublas_r1);
-
-    
-    VCLVectorType vcl_v1(ublas_v1.size());
-    viennacl::vector_range<VCLVectorType> vcl_v1_sub(vcl_v1, vcl_r1);
-    VCLVectorType vcl_v2(ublas_v2.size());
-    viennacl::vector_range<VCLVectorType> vcl_v2_sub(vcl_v2, vcl_r1);
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-    viennacl::copy(ublas_A_sub1, vcl_A_sub1);
-    
-    
-    ublas_v2_sub = prod(ublas_A_sub1, ublas_v1_sub);
-    vcl_v2_sub = viennacl::linalg::prod(vcl_A_sub1, vcl_v1_sub);
-
-    if (check_for_equality_vector(ublas_v2, vcl_v2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Matrix Range" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<viennacl::row_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  if (run_test<viennacl::column_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<viennacl::row_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-    if (run_test<viennacl::column_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-
-  return EXIT_SUCCESS;
-}
-
diff --git a/tests/src/matrix_slice.cpp b/tests/src/matrix_slice.cpp
deleted file mode 100644
index acdc09b..0000000
--- a/tests/src/matrix_slice.cpp
+++ /dev/null
@@ -1,563 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/matrix_proxy.hpp"
-#include "viennacl/vector_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/matrix_proxy.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-           
-template <typename VectorType, typename VCLVectorType>
-bool check_for_equality_vector(VectorType const & ublas_v, VCLVectorType const & vcl_v)
-{
-  typedef typename VectorType::value_type   value_type;
-  
-  boost::numeric::ublas::vector<value_type> vcl_v_cpu(vcl_v.size());
-  viennacl::copy(vcl_v, vcl_v_cpu);
-  
-  for (std::size_t i=0; i<ublas_v.size(); ++i)
-  {
-    if (ublas_v(i) != vcl_v_cpu(i))
-    {
-      if ( std::abs(ublas_v(i) - vcl_v_cpu(i)) / std::max(ublas_v(i), vcl_v_cpu(i)) > 1e-5 ) 
-      {
-        std::cout << "Error at index (" << i << "): " << ublas_v(i) << " vs " << vcl_v_cpu(i) << std::endl;
-        std::cout << ublas_v << std::endl;
-        std::cout << vcl_v_cpu << std::endl;
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-
-template <typename MatrixType, typename VCLMatrixType>
-bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A)
-{
-  typedef typename MatrixType::value_type   value_type;
-  
-  boost::numeric::ublas::matrix<value_type> vcl_A_cpu(vcl_A.size1(), vcl_A.size2());
-  viennacl::copy(vcl_A, vcl_A_cpu);
-  
-  for (std::size_t i=0; i<ublas_A.size1(); ++i)
-  {
-    for (std::size_t j=0; j<ublas_A.size2(); ++j)
-    {
-      if (ublas_A(i,j) != vcl_A_cpu(i,j))
-      {
-        if ( std::abs(ublas_A(i,j) - vcl_A_cpu(i,j)) / std::max(ublas_A(i,j), vcl_A_cpu(i,j)) > 1e-5 ) 
-        {
-          std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
-          std::cout << ublas_A << std::endl;
-          std::cout << vcl_A_cpu << std::endl;
-          return false;
-        }
-      }
-    }
-  }
-  return true;
-}
-
-
-           
-template <typename T, typename ScalarType>
-int run_test()
-{
-    //typedef float               ScalarType;
-    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
-    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-    
-    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
-    typedef viennacl::vector<ScalarType>       VCLVectorType;
-    
-    viennacl::scalar<ScalarType> gpu_pi = ScalarType(3.1415);
-    
-    //std::size_t dim_large = 196;
-    //std::size_t dim_small = 64;
-    //std::size_t dim_large = 75;  //Note: ensure dim_large > 2 * dim_small
-    //std::size_t dim_small = 34;
-
-    std::size_t dim_large = 75;  //Note: ensure dim_large > 2 * dim_small
-    std::size_t dim_small = 34;
-    
-    //setup ublas objects:
-    MatrixType ublas_A(dim_large, dim_large);
-    for (std::size_t i=0; i<ublas_A.size1(); ++i)
-      for (std::size_t j=0; j<ublas_A.size2(); ++j)
-        ublas_A(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_B(dim_small, dim_small);
-    for (std::size_t i=0; i<ublas_B.size1(); ++i)
-      for (std::size_t j=0; j<ublas_B.size2(); ++j)
-        ublas_B(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_C(dim_large, 2 * dim_small);
-    for (std::size_t i=0; i<ublas_C.size1(); ++i)
-      for (std::size_t j=0; j<ublas_C.size2(); ++j)
-        ublas_C(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-
-    MatrixType ublas_D(2 * dim_small, dim_large);
-    for (std::size_t i=0; i<ublas_D.size1(); ++i)
-      for (std::size_t j=0; j<ublas_D.size2(); ++j)
-        ublas_D(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-      
-    boost::numeric::ublas::slice ublas_s1(0, 2, dim_small);
-    boost::numeric::ublas::slice ublas_s2(dim_large - 2 * dim_small, 2, dim_small);
-    boost::numeric::ublas::matrix_slice<MatrixType> ublas_A_sub1(ublas_A, ublas_s1, ublas_s1);
-    boost::numeric::ublas::matrix_slice<MatrixType> ublas_A_sub2(ublas_A, ublas_s2, ublas_s2);
-
-    boost::numeric::ublas::matrix_slice<MatrixType> ublas_C_sub(ublas_C, ublas_s1, ublas_s1);
-    boost::numeric::ublas::matrix_slice<MatrixType> ublas_D_sub(ublas_D, ublas_s1, ublas_s1);
-
-    //Setup ViennaCL objects    
-    VCLMatrixType vcl_A(dim_large, dim_large);
-    viennacl::copy(ublas_A, vcl_A);
-    VCLMatrixType vcl_B(dim_small, dim_small);
-    viennacl::copy(ublas_B, vcl_B);
-    VCLMatrixType vcl_C(dim_large, 2 * dim_small);
-    viennacl::copy(ublas_C, vcl_C);
-    VCLMatrixType vcl_D(2 * dim_small, dim_large);
-    viennacl::copy(ublas_D, vcl_D);
-    
-    viennacl::slice vcl_s1(0, 2, dim_small);
-    viennacl::slice vcl_s2(dim_large - 2 * dim_small, 2, dim_small);
-    viennacl::matrix_slice<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_s1, vcl_s1);
-    viennacl::matrix_slice<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_s2, vcl_s2);
-    
-    viennacl::matrix_slice<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_s1, vcl_s1);
-    viennacl::matrix_slice<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_s1, vcl_s1);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing upper left copy to A... ";
-    ublas_A_sub1 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub1);
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "Testing lower right copy to A... ";
-    ublas_A_sub2 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub2);
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    
-    std::cout << "Testing upper copy to C... ";
-    ublas_C_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_C_sub);
-    if (check_for_equality(ublas_C, vcl_C))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    
-    std::cout << "Testing left copy to D... ";
-    ublas_D_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_D_sub);
-    if (check_for_equality(ublas_D, vcl_D))
-      std::cout << "PASSED!" << std::endl;
-    else
-      std::cout << std::endl << "TEST failed!";
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing upper left copy to A... ";
-    if (check_for_equality(ublas_A_sub1, vcl_A_sub1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing lower right copy to A... ";
-    if (check_for_equality(ublas_A_sub2, vcl_A_sub2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing upper copy to C... ";
-    if (check_for_equality(ublas_C_sub, vcl_C_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing left copy to D... ";
-    if (check_for_equality(ublas_D_sub, vcl_D_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 3: Addition //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 += ublas_A_sub2;
-    vcl_A_sub2 += vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B += ublas_A_sub2;
-    vcl_B += vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 + ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 + ublas_A_sub2;
-    vcl_B = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 -= ublas_A_sub2;
-    vcl_A_sub2 -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B -= ublas_A_sub2;
-    vcl_B -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 - ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 - ublas_A_sub2;
-    vcl_B = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 5: Scaling //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A, vcl_A);
-    
-    std::cout << "Multiplication with CPU scalar: ";
-    ublas_A_sub2 *= ScalarType(3.1415);
-    vcl_A_sub2 *= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Multiplication with GPU scalar: ";
-    ublas_A_sub2 *= gpu_pi;
-    vcl_A_sub2 *= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "Division with CPU scalar: ";
-    ublas_A_sub2 /= ScalarType(3.1415);
-    vcl_A_sub2 /= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Division with GPU scalar: ";
-    ublas_A_sub2 /= gpu_pi;
-    vcl_A_sub2 /= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 6: Matrix-Matrix Products //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A, vcl_A);
-    viennacl::copy(ublas_B, vcl_B);
-    viennacl::copy(ublas_C, vcl_C);
-
-    std::cout << "Assigned C = A * B: ";
-    ublas_A_sub1 = prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A * B^T: ";
-    ublas_A_sub1 = prod(ublas_C_sub, trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B^T: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add of prod(): ";
-    ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 7: Matrix-Vector Products //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    VectorType ublas_v1(dim_large);
-    for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = static_cast<ScalarType>(i);
-    boost::numeric::ublas::vector_slice<VectorType> ublas_v1_sub(ublas_v1, ublas_s1);
-
-    VectorType ublas_v2(dim_large);
-    for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = static_cast<ScalarType>(i) - static_cast<ScalarType>(5);
-    boost::numeric::ublas::vector_slice<VectorType> ublas_v2_sub(ublas_v2, ublas_s1);
-
-    
-    VCLVectorType vcl_v1(ublas_v1.size());
-    viennacl::vector_slice<VCLVectorType> vcl_v1_sub(vcl_v1, vcl_s1);
-    VCLVectorType vcl_v2(ublas_v2.size());
-    viennacl::vector_slice<VCLVectorType> vcl_v2_sub(vcl_v2, vcl_s1);
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-    viennacl::copy(ublas_A_sub1, vcl_A_sub1);
-    
-    
-    ublas_v2_sub = prod(ublas_A_sub1, ublas_v1_sub);
-    vcl_v2_sub = viennacl::linalg::prod(vcl_A_sub1, vcl_v1_sub);
-
-    if (check_for_equality_vector(ublas_v2, vcl_v2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Matrix Slice" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<viennacl::row_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  if (run_test<viennacl::column_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<viennacl::row_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-    if (run_test<viennacl::column_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-
-  return EXIT_SUCCESS;
-}
-
diff --git a/tests/src/vector_range.cpp b/tests/src/vector_range.cpp
deleted file mode 100644
index efdd2ed..0000000
--- a/tests/src/vector_range.cpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/vector_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-
-template <typename VectorType, typename VCLVectorType>
-bool check_for_equality(VectorType const & ublas_v, VCLVectorType const & vcl_v)
-{
-  typedef typename VectorType::value_type   value_type;
-  
-  std::vector<value_type> vcl_v_cpu(vcl_v.size());
-  viennacl::copy(vcl_v, vcl_v_cpu);
-
-  bool error_detected = false;
-  for (size_t i=0; i<ublas_v.size(); ++i)
-  {
-    if (ublas_v[i] != vcl_v_cpu[i])
-    {
-      //check whether there are just some round-off errors:
-      if (std::abs(ublas_v[i] - vcl_v_cpu[i]) / std::max(ublas_v[i], vcl_v_cpu[i]) > 1e-5)
-      {
-        std::cout << "Error at index (" << i << "): " << ublas_v[i] << " vs " << vcl_v_cpu[i] << std::endl;
-        error_detected = true;
-      }
-    }
-  }
-  
-  if (!error_detected)
-    std::cout << "PASSED!" << std::endl;
-  else
-  {
-    std::cout << std::endl << "TEST failed!";
-    return EXIT_FAILURE;
-  }
-  
-  return true;
-}
-
-
-           
-template <typename ScalarType>
-int run_test()
-{
-    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-    
-    typedef viennacl::vector<ScalarType>                    VCLVectorType;
-    
-    std::size_t dim_large = 70;
-    std::size_t dim_small = 27;
-    
-    //setup ublas objects:
-    VectorType ublas_v1(dim_large);
-    for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = static_cast<ScalarType>(i+1);
-
-    VectorType ublas_v2(dim_small);
-    for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = static_cast<ScalarType>(dim_large + i);
-      
-    boost::numeric::ublas::range ublas_r1(0, dim_small);
-    boost::numeric::ublas::range ublas_r2(dim_small - 1, 2*dim_small - 1);
-    boost::numeric::ublas::range ublas_r3(dim_large - dim_small, dim_large);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub1(ublas_v1, ublas_r1);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub2(ublas_v1, ublas_r2);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub3(ublas_v1, ublas_r3);
-
-    //Setup ViennaCL objects    
-    VCLVectorType vcl_v1(dim_large);
-    viennacl::copy(ublas_v1, vcl_v1);
-    VCLVectorType vcl_v2(dim_small);
-    viennacl::copy(ublas_v2, vcl_v2);
-    
-    viennacl::range vcl_r1(0, dim_small);
-    viennacl::range vcl_r2(dim_small - 1, 2*dim_small - 1);
-    viennacl::range vcl_r3(dim_large - dim_small, dim_large);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub1(vcl_v1, vcl_r1);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub2(vcl_v1, vcl_r2);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub3(vcl_v1, vcl_r3);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    ublas_v1_sub1 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub1);
-    std::cout << "Testing copy to begin of v1... ";
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    ublas_v1_sub2 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub2);
-    std::cout << "Testing copy to middle of v1... ";
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    
-    ublas_v1_sub3 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub3);
-    std::cout << "Testing copy to bottom of v1... ";
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing beginning of v1... ";
-    check_for_equality(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing middle of v1... ";
-    check_for_equality(ublas_v1_sub2, vcl_v1_sub2);
-    
-    std::cout << "Testing bottom of v1... ";
-    check_for_equality(ublas_v1_sub3, vcl_v1_sub3);
-
-
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Assignments //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-
-    std::cout << "Testing vector assigned to range... ";
-    ublas_v1_sub1 = ublas_v2;
-    vcl_v1_sub1 = vcl_v2;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Testing range assigned to vector... ";
-    ublas_v2 = ublas_v1_sub1;
-    vcl_v2 = vcl_v1_sub1;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Testing range assigned to range... ";
-    ublas_v1_sub1 = ublas_v1_sub3;
-    vcl_v1_sub1 = vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace add //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing inplace add at beginning of v1: ";
-    ublas_v1_sub1 += ublas_v1_sub1;
-    vcl_v1_sub1 += vcl_v1_sub1;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace add at middle of v1: ";
-    ublas_v1_sub2 += ublas_v1_sub2;
-    vcl_v1_sub2 += vcl_v1_sub2;
-    check_for_equality(ublas_v1, vcl_v1);
-
-
-    std::cout << "Testing inplace add at end of v1: ";
-    ublas_v1_sub3 += ublas_v1_sub3;
-    vcl_v1_sub3 += vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << "Testing inplace add at end of v1: ";
-    ublas_v1_sub3 += ublas_v1_sub3;
-    vcl_v1_sub3 += vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace add of vector with range: ";
-    viennacl::copy(ublas_v2, vcl_v2);
-    ublas_v1_sub2 += ublas_v2;
-    vcl_v1_sub2 += vcl_v2;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace sub //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing inplace sub at beginning of v1: ";
-    ublas_v1_sub1 -= ublas_v1_sub1;
-    vcl_v1_sub1 -= vcl_v1_sub1;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    std::cout << "Testing inplace sub at middle of v1: ";
-    ublas_v1_sub2 -= ublas_v1_sub2;
-    vcl_v1_sub2 -= vcl_v1_sub2;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-
-    std::cout << "Testing inplace sub at end of v1: ";
-    ublas_v1_sub3 -= ublas_v1_sub3;
-    vcl_v1_sub3 -= vcl_v1_sub3;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace sub of vector with range: ";
-    viennacl::copy(ublas_v2, vcl_v2);
-    ublas_v1_sub2 -= ublas_v2;
-    vcl_v1_sub2 -= vcl_v2;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace mult/div //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-    ScalarType s = 3.14;
-    viennacl::scalar<ScalarType>  vcl_s = s;
-
-    std::cout << "Multiplication with CPU scalar: ";
-    ublas_v1_sub1 *= s;
-    vcl_v1_sub1   *= s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Multiplication with GPU scalar: ";
-    ublas_v1_sub3 *= vcl_s;
-    vcl_v1_sub3   *= vcl_s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    std::cout << "Division with CPU scalar: ";
-    ublas_v1_sub1 /= s;
-    vcl_v1_sub1   /= s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Division with GPU scalar: ";
-    ublas_v1_sub3 /= vcl_s;
-    vcl_v1_sub3   /= vcl_s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Vector Operations (norm_X, inner_prod, etc.) //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    for (std::size_t i=0; i<ublas_v1.size(); ++i) //reinit values
-      ublas_v1(i) = static_cast<ScalarType>(i+1);
-    
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    viennacl::copy(ublas_v1_sub2, vcl_v1_sub2);
-    viennacl::copy(ublas_v1_sub3, vcl_v1_sub3);
-    
-    double result_ublas = 0;
-    double result_viennacl = 0;
-
-    std::cout << "Testing norm_1: ";
-    result_ublas = norm_1(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_1(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing norm_2: ";
-    result_ublas = norm_2(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_2(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing norm_inf: ";
-    result_ublas = norm_inf(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_inf(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing inner_prod: ";
-    result_ublas = inner_prod(ublas_v1_sub1, ublas_v1_sub3);
-    result_viennacl = viennacl::linalg::inner_prod(vcl_v1_sub1, vcl_v1_sub3);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Vector Range" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  //std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    //std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-  
-  return EXIT_SUCCESS;
-}
-
diff --git a/tests/src/vector_slice.cpp b/tests/src/vector_slice.cpp
deleted file mode 100644
index 34f3c10..0000000
--- a/tests/src/vector_slice.cpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/vector_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-
-template <typename VectorType, typename VCLVectorType>
-bool check_for_equality(VectorType const & ublas_v, VCLVectorType const & vcl_v)
-{
-  typedef typename VectorType::value_type   value_type;
-  
-  std::vector<value_type> vcl_v_cpu(vcl_v.size());
-  viennacl::copy(vcl_v, vcl_v_cpu);
-
-  bool error_detected = false;
-  for (size_t i=0; i<ublas_v.size(); ++i)
-  {
-    if (ublas_v[i] != vcl_v_cpu[i])
-    {
-      //check whether there are just some round-off errors:
-      if (std::abs(ublas_v[i] - vcl_v_cpu[i]) / std::max(ublas_v[i], vcl_v_cpu[i]) > 1e-5)
-      {
-        std::cout << "Error at index (" << i << "): " << ublas_v[i] << " vs " << vcl_v_cpu[i] << std::endl;
-        error_detected = true;
-      }
-    }
-  }
-  
-  if (!error_detected)
-    std::cout << "PASSED!" << std::endl;
-  else
-  {
-    std::cout << std::endl << "TEST failed!";
-    return EXIT_FAILURE;
-  }
-  
-  return true;
-}
-
-
-           
-template <typename ScalarType>
-int run_test()
-{
-    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-    
-    typedef viennacl::vector<ScalarType>                    VCLVectorType;
-    
-    std::size_t dim_large = 90;
-    std::size_t dim_small = 27;
-    
-    //setup ublas objects:
-    VectorType ublas_v1(dim_large);
-    for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = static_cast<ScalarType>(i+1);
-
-    VectorType ublas_v2(dim_small);
-    for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = static_cast<ScalarType>(dim_large + i);
-      
-    boost::numeric::ublas::slice ublas_s1(0, 2, dim_small);
-    boost::numeric::ublas::slice ublas_s2(dim_small - 1, 2, dim_small);
-    boost::numeric::ublas::slice ublas_s3(dim_large - 3 * dim_small, 3, dim_small);
-    boost::numeric::ublas::vector_slice<VectorType> ublas_v1_sub1(ublas_v1, ublas_s1);
-    boost::numeric::ublas::vector_slice<VectorType> ublas_v1_sub2(ublas_v1, ublas_s2);
-    boost::numeric::ublas::vector_slice<VectorType> ublas_v1_sub3(ublas_v1, ublas_s3);
-
-    //Setup ViennaCL objects    
-    VCLVectorType vcl_v1(dim_large);
-    viennacl::copy(ublas_v1, vcl_v1);
-    VCLVectorType vcl_v2(dim_small);
-    viennacl::copy(ublas_v2, vcl_v2);
-    
-    viennacl::slice vcl_s1(0, 2, dim_small);
-    viennacl::slice vcl_s2(dim_small - 1, 2, dim_small);
-    viennacl::slice vcl_s3(dim_large - 3 * dim_small, 3, dim_small);
-    viennacl::vector_slice<VCLVectorType>   vcl_v1_sub1(vcl_v1, vcl_s1);
-    viennacl::vector_slice<VCLVectorType>   vcl_v1_sub2(vcl_v1, vcl_s2);
-    viennacl::vector_slice<VCLVectorType>   vcl_v1_sub3(vcl_v1, vcl_s3);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing copy to begin of v1... ";
-    ublas_v1_sub1 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub1);
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    std::cout << "Testing copy to middle of v1... ";
-    ublas_v1_sub2 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub2);
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    
-    std::cout << "Testing copy to bottom of v1... ";
-    ublas_v1_sub3 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub3);
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing beginning of v1... ";
-    check_for_equality(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing middle of v1... ";
-    check_for_equality(ublas_v1_sub2, vcl_v1_sub2);
-    
-    std::cout << "Testing bottom of v1... ";
-    check_for_equality(ublas_v1_sub3, vcl_v1_sub3);
-
-
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Assignments //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-
-    std::cout << "Testing vector assigned to slice... ";
-    ublas_v1_sub1 = ublas_v2;
-    vcl_v1_sub1 = vcl_v2;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Testing slice assigned to vector... ";
-    ublas_v2 = ublas_v1_sub1;
-    vcl_v2 = vcl_v1_sub1;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Testing slice assigned to slice... ";
-    ublas_v1_sub1 = ublas_v1_sub3;
-    vcl_v1_sub1 = vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace add //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing inplace add at beginning of v1: ";
-    ublas_v1_sub1 += ublas_v1_sub1;
-    vcl_v1_sub1 += vcl_v1_sub1;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace add at middle of v1: ";
-    ublas_v1_sub2 += ublas_v1_sub2;
-    vcl_v1_sub2 += vcl_v1_sub2;
-    check_for_equality(ublas_v1, vcl_v1);
-
-
-    std::cout << "Testing inplace add at end of v1: ";
-    ublas_v1_sub3 += ublas_v1_sub3;
-    vcl_v1_sub3 += vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << "Testing inplace add at end of v1: ";
-    ublas_v1_sub3 += ublas_v1_sub3;
-    vcl_v1_sub3 += vcl_v1_sub3;
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace add of vector with slice: ";
-    viennacl::copy(ublas_v2, vcl_v2);
-    ublas_v1_sub2 += ublas_v2;
-    vcl_v1_sub2 += vcl_v2;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace sub //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing inplace sub at beginning of v1: ";
-    ublas_v1_sub1 -= ublas_v1_sub1;
-    vcl_v1_sub1 -= vcl_v1_sub1;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    std::cout << "Testing inplace sub at middle of v1: ";
-    ublas_v1_sub2 -= ublas_v1_sub2;
-    vcl_v1_sub2 -= vcl_v1_sub2;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-
-    std::cout << "Testing inplace sub at end of v1: ";
-    ublas_v1_sub3 -= ublas_v1_sub3;
-    vcl_v1_sub3 -= vcl_v1_sub3;
-
-    check_for_equality(ublas_v1, vcl_v1);
-
-    std::cout << "Testing inplace sub of vector with slice: ";
-    viennacl::copy(ublas_v2, vcl_v2);
-    ublas_v1_sub2 -= ublas_v2;
-    vcl_v1_sub2 -= vcl_v2;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Inplace mult/div //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1, vcl_v1);
-    viennacl::copy(ublas_v2, vcl_v2);
-    ScalarType s = static_cast<ScalarType>(3.14);
-    viennacl::scalar<ScalarType>  vcl_s = s;
-
-    std::cout << "Multiplication with CPU scalar: ";
-    ublas_v1_sub1 *= s;
-    vcl_v1_sub1   *= s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Multiplication with GPU scalar: ";
-    ublas_v1_sub3 *= vcl_s;
-    vcl_v1_sub3   *= vcl_s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    std::cout << "Division with CPU scalar: ";
-    ublas_v1_sub1 /= s;
-    vcl_v1_sub1   /= s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-    std::cout << "Division with GPU scalar: ";
-    ublas_v1_sub3 /= vcl_s;
-    vcl_v1_sub3   /= vcl_s;
-    
-    check_for_equality(ublas_v1, vcl_v1);
-    
-
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test: Vector Operations (norm_X, inner_prod, etc.) //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    for (std::size_t i=0; i<ublas_v1.size(); ++i) //reinit values
-      ublas_v1(i) = static_cast<ScalarType>(i+1);
-    
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    viennacl::copy(ublas_v1_sub2, vcl_v1_sub2);
-    viennacl::copy(ublas_v1_sub3, vcl_v1_sub3);
-    
-    double result_ublas = 0;
-    double result_viennacl = 0;
-
-    std::cout << "Testing norm_1: ";
-    result_ublas = norm_1(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_1(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing norm_2: ";
-    result_ublas = norm_2(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_2(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing norm_inf: ";
-    result_ublas = norm_inf(ublas_v1_sub2);
-    result_viennacl = viennacl::linalg::norm_inf(vcl_v1_sub2);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing inner_prod: ";
-    result_ublas = inner_prod(ublas_v1_sub1, ublas_v1_sub3);
-    result_viennacl = viennacl::linalg::inner_prod(vcl_v1_sub1, vcl_v1_sub3);
-    
-    if (std::abs(result_ublas - result_viennacl) / std::abs(result_ublas) < 1e-3)
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      std::cout << "Ublas: "    << result_ublas << std::endl;
-      std::cout << "ViennaCL: " << result_viennacl << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Vector Slice" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  //std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    //std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-  
-  return EXIT_SUCCESS;
-}
-
diff --git a/viennacl/generator/compound_node.hpp b/viennacl/generator/compound_node.hpp
deleted file mode 100644
index d6b7f0d..0000000
--- a/viennacl/generator/compound_node.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#ifndef VIENNACL_GENERATOR_COMPOUND_NODE_HPP
-#define VIENNACL_GENERATOR_COMPOUND_NODE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file compound_node.hpp
- *  @brief Structures corresponding to binary nodes in the expression tree
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include <string>
-#include <sstream>
-#include <set>
-
-#include "viennacl/generator/forwards.h"
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    /**
-    * @brief Binary node class for storing expression trees
-    * 
-    * @tparam LHS_ LHS of the expression
-    * @tparam OP_ Operator of the expression
-    * @tparam RHS_ RHS of the expression
-    * @tparam is_temporary_ Boolean for storing whether the binary node is temporary.
-    */
-    template<class LHS_, class OP_, class RHS_, bool is_temporary_>
-    class compound_node 
-    {
-      public:
-        typedef LHS_  LHS;
-        typedef RHS_  RHS;
-        typedef OP_   OP;
-
-        static const bool is_temporary = is_temporary_;
-
-        static const std::string name() 
-        {
-            return LHS::name() + "_" + OP::name() + "_" + RHS::name();
-        }
-    };
-
-    template<class LHS_, class RHS_, bool is_temporary_>
-    class compound_node<LHS_,inner_prod_type,RHS_, is_temporary_> 
-    {
-      public:
-        /**
-        * @brief Specialization for the inner product
-        */
-        typedef LHS_ LHS;
-        typedef RHS_ RHS;
-        typedef inner_prod_type OP;
-        typedef typename result_of::expression_type<RHS>::Result IntermediateType;  //Note: Visual Studio does not allow to combine this line with the next one directly.
-        typedef typename IntermediateType::ScalarType ScalarType;
-
-        static const bool is_temporary = is_temporary_;
-
-        enum { id = -2 };
-
-        static const std::string kernel_arguments() 
-        {
-          return  "__global float * " + name() + '\n';
-        }
-
-        static const std::string name() 
-        {
-          return  LHS::name() + "_inprod_" + RHS::name();
-        }
-
-        static const std::string scalar_name() 
-        {
-          return name() +"_s";
-        };
-
-    };
-
-    /**
-    * @brief Specialization for the matrix-vector product.
-    */
-    template<class LHS_, class RHS_, bool is_temporary_>
-    class compound_node<LHS_,prod_type,RHS_, is_temporary_> 
-    {
-      private:
-        typedef compound_node<LHS_,prod_type,RHS_, is_temporary_> self_type;
-
-      public:
-        typedef LHS_ LHS;
-        typedef RHS_ RHS;
-
-        typedef prod_type OP;
-        enum { id = LHS::id };
-
-        typedef typename result_of::expression_type<RHS>::Result IntermediateType;    //Note: Visual Studio does not allow to combine this line with the next one directly.
-        typedef typename IntermediateType::ScalarType ScalarType;
-        static const unsigned int Alignment = result_of::expression_type<RHS>::Result::Alignment;
-        static const bool is_temporary = is_temporary_;
-
-        static const std::string name() 
-        {
-          return LHS::name() + "_prod_" + RHS::name();
-        }
-
-        static const std::string size2_name() 
-        {
-          return "size_"+name();
-        }
-
-        static const std::string internal_size2_name() 
-        {
-          return "internal_size_"+name();
-        }
-        
-        static const std::string name_argument() 
-        {
-          return " __global " + print_type<ScalarType*,Alignment>::value() + " " + name();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-          return name_argument() + ", unsigned int " + size2_name() + ", unsigned int " + internal_size2_name() + "\n" ;
-        }
-    };
-
-
-    /** @brief Addition operator on 2 elements of the same type */
-    template<class LHS_TYPE, class RHS_TYPE>
-    typename enable_if< is_same_expression_type<LHS_TYPE, RHS_TYPE>,
-                        compound_node<LHS_TYPE, add_type, RHS_TYPE> >::type
-    operator+ ( LHS_TYPE const & lhs, RHS_TYPE const & rhs ) 
-    {
-      return compound_node<LHS_TYPE, add_type, RHS_TYPE>();
-    }
-
-    /** @brief Substraction operator on 2 elements of the same type */
-    template<class LHS_TYPE, class RHS_TYPE>
-    typename enable_if< is_same_expression_type<LHS_TYPE, RHS_TYPE>,
-                        compound_node<LHS_TYPE, sub_type, RHS_TYPE> >::type
-    operator- ( LHS_TYPE const & lhs, RHS_TYPE const & rhs ) 
-    {
-      return compound_node<LHS_TYPE, sub_type, RHS_TYPE>();
-    }
-
-    /** @brief Helper for the inner_prod operator */
-    template<class LHS, class RHS>
-    struct make_inner_prod;
-
-    template<class LHS, class LHS_SIZE_DESCRIPTOR,
-             class RHS, class RHS_SIZE_DESCRIPTOR>
-    struct make_inner_prod<result_of::vector_expression<LHS, LHS_SIZE_DESCRIPTOR>,
-                           result_of::vector_expression<RHS, RHS_SIZE_DESCRIPTOR> > 
-    {
-      typedef compound_node<LHS,inner_prod_type,RHS,true> Result;
-    };
-
-
-    /** @brief Inner product operator */
-    template<class LHS, class RHS>
-    compound_node<LHS,inner_prod_type,RHS,true> inner_prod ( LHS vec_expr1,RHS vec_expr2 ) 
-    {
-      typedef typename result_of::expression_type<LHS>::Result LHS_TYPE;
-      typedef typename result_of::expression_type<RHS>::Result RHS_TYPE;
-      typename make_inner_prod<LHS_TYPE,RHS_TYPE>::Result result;
-      
-      return result;;
-    }
-
-    /** @brief Product operator */
-    template<class LHS, class RHS>
-    compound_node<LHS,prod_type,RHS> prod ( LHS vec_expr1,RHS vec_expr2 ) 
-    {
-      return compound_node<LHS,prod_type,RHS>();
-    }
-
-  } // namespace generator
-} // namespace viennacl
-
-#endif
-
diff --git a/viennacl/generator/custom_operation.hpp b/viennacl/generator/custom_operation.hpp
deleted file mode 100644
index a83cf9b..0000000
--- a/viennacl/generator/custom_operation.hpp
+++ /dev/null
@@ -1,268 +0,0 @@
-#ifndef VIENNACL_GENERATOR_CUSTOM_OPERATION_HPP
-#define VIENNACL_GENERATOR_CUSTOM_OPERATION_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file custom_operation.hpp
- *  @brief User Interface for making custom operations.
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include <vector>
-#include <set>
-#include <algorithm>
-
-#include "viennacl/generator/get_kernels_infos.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-
-
-namespace viennacl 
-{
-  namespace generator 
-  {
-
-    /** @brief A class for making a custom operation */
-    class custom_operation 
-    { 
-
-      public :
-
-        /** @brief CTor
-        * 	
-        * @param expression the expression to build the interface for
-        * @param program_name_hint the code for this expression will be stored in the program provided by this name
-        */
-        template<class T>
-        custom_operation ( T const & expression, std::string const & program_name_hint="" ) 
-        {
-          program_name_ = viennacl::generator::program_infos<T>::value (program_name_hint, sources_,runtime_wrappers_);
-          create_program ( static_cast<bool> ( viennacl::generator::tree_utils::count_if<T,viennacl::generator::is_inner_product_leaf>::value ) );
-        }
-        
-        /** @brief DTor */
-        ~custom_operation()
-        {
-          for (viennacl::generator::runtime_wrappers_t::iterator it  = runtime_wrappers_.begin();
-                                                                 it != runtime_wrappers_.end();
-                                                               ++it)
-          {
-            delete (it->second.second);
-          }
-        }
-
-        /** @brief Returns the list of the kernels involved in the operation */
-        viennacl::generator::KernelsSources const & kernels_sources() const 
-        {
-          return sources_;
-        }
-
-        /** @brief Return the generated sources */
-        std::string kernels_source_code() const 
-        {
-          std::string res;
-          for (viennacl::generator::KernelsSources::const_iterator it  = sources_.begin();
-                                                                   it != sources_.end();
-                                                                 ++it)
-          {
-            res += it->second + "\n";
-          }
-          
-          return res;
-        }
-
-        /** @brief Returns the program name */
-        std::string const & program_name() const  { return program_name_; }
-
-        
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0>
-        custom_operation & operator() ( T0 const & t0) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1>
-        custom_operation & operator() ( T0 const & t0, T1 const & t1 ) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1, class T2>
-        custom_operation & operator() ( T0 const & t0, T1 const & t1, T2 const & t2 ) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1, class T2, class T3>
-        custom_operation & operator() ( T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3 ) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          user_args_.insert( std::make_pair(3, viennacl::any((T3*)&t3)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1, class T2, class T3, class T4>
-        custom_operation & operator() ( T0 & t0, T1 & t1, T2 & t2, T3 & t3, T4 & t4 ) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          user_args_.insert( std::make_pair(3, viennacl::any((T3*)&t3)) );
-          user_args_.insert( std::make_pair(4, viennacl::any((T4*)&t4)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1, class T2, class T3, class T4, class T5>
-        custom_operation & operator() ( T0 & t0, T1 & t1, T2 & t2, T3 & t3, T4 & t4, T5 & t5 ) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          user_args_.insert( std::make_pair(3, viennacl::any((T3*)&t3)) );
-          user_args_.insert( std::make_pair(4, viennacl::any((T4*)&t4)) );
-          user_args_.insert( std::make_pair(5, viennacl::any((T5*)&t5)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-        /** @brief Convenience for enqueuing the custom operation */
-        template<class T0, class T1, class T2, class T3, class T4, class T5, class T6>
-        custom_operation & operator() ( T0 & t0, T1 & t1, T2 & t2, T3 & t3, T4 & t4, T5 & t5, T6 & t6) 
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          user_args_.insert( std::make_pair(3, viennacl::any((T3*)&t3)) );
-          user_args_.insert( std::make_pair(4, viennacl::any((T4*)&t4)) );
-          user_args_.insert( std::make_pair(5, viennacl::any((T5*)&t5)) );
-          user_args_.insert( std::make_pair(6, viennacl::any((T6*)&t6)) );
-          add_operation_arguments();
-          return *this;
-        }
-        
-        /** @brief Convenience for enqueuing the custom operation */
-        template <class T0, class T1, class T2, class T3, class T4, class T5, class T6, class T7>
-        custom_operation & operator() ( T0 & t0, T1 & t1, T2 & t2, T3 & t3, T4 & t4, T5 & t5, T6 & t6, T7 & t7 )
-        {
-          user_args_.insert( std::make_pair(0, viennacl::any((T0*)&t0)) );
-          user_args_.insert( std::make_pair(1, viennacl::any((T1*)&t1)) );
-          user_args_.insert( std::make_pair(2, viennacl::any((T2*)&t2)) );
-          user_args_.insert( std::make_pair(3, viennacl::any((T3*)&t3)) );
-          user_args_.insert( std::make_pair(4, viennacl::any((T4*)&t4)) );
-          user_args_.insert( std::make_pair(5, viennacl::any((T5*)&t5)) );
-          user_args_.insert( std::make_pair(6, viennacl::any((T6*)&t6)) );
-          user_args_.insert( std::make_pair(7, viennacl::any((T7*)&t7)) );
-          add_operation_arguments();
-          return *this;
-        }
-
-      private:
-
-        void create_program ( bool include_sum_kernel )
-        {
-          std::string kernels_string;
-          for (viennacl::generator::KernelsSources::iterator it  = sources_.begin(); 
-                                                             it != sources_.end(); 
-                                                           ++it ) 
-          {
-            kernels_string += it->second + "\n";
-          }
-          
-          viennacl::ocl::program& program = viennacl::ocl::current_context().add_program(kernels_string, program_name_);
-          
-          for (viennacl::generator::KernelsSources::iterator it  = sources_.begin(); 
-                                                             it != sources_.end(); 
-                                                           ++it) 
-          {
-            program.add_kernel(it->first);
-          }
-        }
-
-        
-        void add_operation_arguments() 
-        {
-          for (generator::runtime_wrappers_t::iterator it  = runtime_wrappers_.begin();
-                                                       it != runtime_wrappers_.end();
-                                                     ++it) 
-          {
-            std::string const & kernel_name = it->first;
-            viennacl::ocl::kernel& current_kernel = viennacl::ocl::current_context().get_program(program_name_).get_kernel(kernel_name);
-            const unsigned int arg_pos = it->second.first;
-            generator::result_of::runtime_wrapper * current_arg = it->second.second;
-        #ifdef VIENNACL_DEBUG_CUSTOM_OPERATION
-            std::cout << "Enqueuing : Kernel " << kernel_name << " Argument : " << current_arg->name() << " | Pos : " << arg_pos << std::endl;
-        #endif
-            current_arg->enqueue(arg_pos,current_kernel,user_args_,temporaries_);
-          }
-        }
-
-    private :
-        typedef std::map<std::string, unsigned int> CurrentArgsContainer;
-        CurrentArgsContainer current_args_pos_;
-            
-        std::map<unsigned int, viennacl::any> user_args_;
-        
-        std::string program_name_;
-        
-        std::vector<viennacl::ocl::local_mem> lmem_;
-        
-        viennacl::generator::KernelsSources sources_;
-        
-        viennacl::generator::runtime_wrappers_t runtime_wrappers_;
-            
-        std::map<std::string, viennacl::ocl::handle<cl_mem> > temporaries_;
-    };
-
-
-    inline void enqueue_custom_op(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue)
-    {
-      for(std::map<std::string,std::string>::const_iterator it = op.kernels_sources().begin(); it != op.kernels_sources().end() ; ++it)
-      {
-        std::string current_kernel_name = it->first;
-        #ifdef VIENNACL_DEBUG_CUSTOM_OPERATION
-        std::cout << "Enqueueing " << current_kernel_name << std::endl;
-        #endif
-        enqueue(viennacl::ocl::current_context().get_program(op.program_name()).get_kernel(current_kernel_name));
-      }
-    }
-    
-  }
-}
-
-#endif
diff --git a/viennacl/generator/elementwise_modifier.hpp b/viennacl/generator/elementwise_modifier.hpp
deleted file mode 100644
index c9c72e2..0000000
--- a/viennacl/generator/elementwise_modifier.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef VIENNACL_GENERATOR_ELEMENTWISE_MODIFIER_HPP
-#define VIENNACL_GENERATOR_ELEMENTWISE_MODIFIER_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/elementwise_modifier.hpp
- *   @brief Contains the stuffs related to the elementwise_modifier
- * 
- *  Generator code contributed by Philippe Tillet
- */
-
-#include <typeinfo>
-#include <string>
-#include <algorithm>
-
-#include "viennacl/generator/forwards.h"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    /**
-    * @brief Implementation of the elementwise_modifier
-    * 
-    * @tparam T the underlying expression to modify
-    * @tparam U the function returning the modifier's expression
-    */
-    template<class T, std::string (*U)()>
-    struct elementwise_modifier_impl
-    {
-      private:
-        static std::string expr_name()
-        {
-          std::string res = U();
-          std::replace(res.begin(),res.end(),'/','o');
-          std::replace(res.begin(),res.end(),'*','x');
-          std::replace(res.begin(),res.end(),'+','a');
-          std::replace(res.begin(),res.end(),'-','s');
-          std::replace(res.begin(),res.end(),' ','_');
-          std::replace(res.begin(),res.end(),'(','p');
-          std::replace(res.begin(),res.end(),')','p');
-          return res;
-        }
-        
-      public:
-        typedef T PRIOR_TYPE;
-
-        enum { id = -2 };
-
-        static std::string name()
-        {
-          return expr_name() + '_' + T::name();
-        }
-        
-        static std::string modify(std::string const & replacer) 
-        {
-          std::string result(U());
-          int pos;
-          while( (pos = result.find('X')) != std::string::npos )
-          {
-            result.replace(pos, 1, '(' + replacer + ')' );
-          }
-          
-          return result;
-        }
-    };
-
-    /** @brief Operator for creating an elementwise_modifier from an expression */
-    template<std::string (*U)(),class T>
-    elementwise_modifier_impl<T,U> elementwise_modifier( T const & t ) 
-    {
-      return elementwise_modifier_impl<T,U>();
-    }
-
-  }
-}
-
-#endif
diff --git a/viennacl/generator/get_kernels_infos.hpp b/viennacl/generator/get_kernels_infos.hpp
deleted file mode 100644
index 6e00bbd..0000000
--- a/viennacl/generator/get_kernels_infos.hpp
+++ /dev/null
@@ -1,579 +0,0 @@
-#ifndef VIENNACL_GENERATOR_CREATE_KERNEL_HPP
-#define VIENNACL_GENERATOR_CREATE_KERNEL_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/get_kernels_infos.hpp
- *  @brief Provides information about kernels
- * 
- *  Generator code contributed by Philippe Tillet
- */
-
-// #include "kernel_utils.hpp"
-
-#include <map>
-
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/operation_types.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_matrix.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_vector.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_scalars.hpp"
-#include "viennacl/generator/tree_operations.hpp"
-#include "viennacl/generator/tokens_management.hpp"
-#include "viennacl/generator/make_code/make_code.hpp"
-#include "viennacl/generator/meta_tools/typelist.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    template <class T, bool is_first = true>
-    struct arguments_list;
-
-    template <bool is_first>
-    struct arguments_list<NullType, is_first > 
-    {
-      static const std::string  string_value() { return ""; }
-    };
-
-
-    template <class Head, class Tail, bool is_first >
-    struct arguments_list<typelist<Head,Tail>, is_first > 
-    {
-      private:
-        static const std::string add_comma ( Int2Type<false> ) { return ", "; }
-        static const std::string add_comma ( Int2Type<true> )  { return ""; }
-
-      public:
-        static const std::string string_value() 
-        {
-            return add_comma ( Int2Type<is_first>() )
-                  + Head::kernel_arguments()
-                  + arguments_list<Tail,false>::string_value();
-        }
-    };
-
-    template<class T>
-    struct requires_local_buffer 
-    {
-      enum { value = is_inner_product_leaf<T>::value };
-    };
-
-    template<class T>
-    struct requires_local_buffer<inner_prod_impl_t<T> > 
-    {
-      enum { value = 1 };
-    };
-
-    template<class T>
-    struct requires_local_buffer_list;
-
-    template<class Head, class Tail>
-    struct requires_local_buffer_list<typelist<Head, Tail> > 
-    {
-      enum { value = static_cast<bool> ( tree_utils::count_if<Head, requires_local_buffer>::value
-                                        || requires_local_buffer_list<Tail>::value )
-           };
-    };
-
-    template<class Head>
-    struct requires_local_buffer_list<typelist<Head, NullType> > 
-    {
-      enum { value = static_cast<bool> ( tree_utils::count_if<Head, requires_local_buffer >::value ) };
-    };
-
-    template<class TLIST, class ASSIGNED>
-    struct calculate_tokens 
-    {
-      typedef typename TLIST::Head::first_type current_token;
-      typedef typename TLIST::Head::second_type token_op;
-
-      static const std::string value() 
-      {
-        return make_code<current_token, token_op, ASSIGNED>::value()
-              + calculate_tokens<typename TLIST::Tail, ASSIGNED>::value();
-      }
-    };
-
-    template<class ASSIGNED>
-    struct calculate_tokens<NullType, ASSIGNED> 
-    {
-      static const std::string value() { return ""; }
-    };
-
-    template<class T>
-    struct get_temporary_dependancies
-    {
-      typedef typename get_temporary_dependancies<typename result_of::expression_type<T>::Result>::Result   Result;
-    };
-
-    template<class T, class SIZE_DESCRIPTOR>
-    struct get_temporary_dependancies<result_of::vector_expression<T,SIZE_DESCRIPTOR> >
-    {
-      typedef SIZE_DESCRIPTOR Result;
-    };
-
-    template<class T>
-    struct get_temporary_dependancies<result_of::scalar_expression<T> >
-    {
-      typedef NullType Result;
-    };
-
-    template<>
-    struct get_temporary_dependancies<NullType>
-    {
-      typedef NullType Result;
-    };
-
-    template<class Head, class Tail>
-    struct get_temporary_dependancies<typelist<Head,Tail> >
-    {
-      typedef typename typelist_utils::append<typename get_temporary_dependancies<Tail>::Result,
-                                              typename get_temporary_dependancies<Head>::Result>::Result   Result;
-    };
-
-
-    template<class T>
-    struct get_kernel_arguments;
-
-    template<class LHS, class OP, class RHS, bool _is_temporary>
-    struct get_kernel_arguments<compound_node<LHS,OP,RHS,_is_temporary> > 
-    {
-      typedef compound_node<LHS,OP,RHS,_is_temporary> Arg;
-      typedef typename tree_utils::extract_if<Arg, is_regular_kernel_parameter, typelist_utils::compare1>::Result RegularLeafs;
-      typedef typename tree_utils::extract_if<Arg, is_temporary_kernel_parameter>::Result                         TemporaryLeafs;
-      typedef typename get_temporary_dependancies<TemporaryLeafs>::Result                                         TemporaryDependancies;
-      typedef typename typelist_utils::fuse<RegularLeafs, TemporaryLeafs, typelist_utils::compare1>::Result       TmpResult0;
-      typedef typename typelist_utils::fuse<TmpResult0, TemporaryDependancies, typelist_utils::compare1>::Result  TmpResult1;
-      typedef typename typelist_utils::no_duplicates<TmpResult1>::Result Result;
-    };
-
-    template<class Head, class Tail>
-    struct get_kernel_arguments<typelist<Head,Tail> > 
-    {
-      typedef typename typelist_utils::fuse<typename get_kernel_arguments<Head>::Result, 
-                                            typename get_kernel_arguments<Tail>::Result,
-                                            typelist_utils::compare1>::Result                TmpResult;
-      typedef typename typelist_utils::no_duplicates<TmpResult>::Result                      Result;
-    };
-
-    template<>
-    struct get_kernel_arguments<NullType> 
-    {
-      typedef NullType Result;
-    };
-
-    template<class TreeList>
-    struct kernel_header;
-
-    template<class Head, class Tail>
-    struct kernel_header<typelist<Head, Tail> > 
-    {
-      private:
-        typedef typelist<Head, Tail> Arg;
-        typedef typename tree_utils::expand<Head>::Result             ExpandedHead;
-        typedef typename tree_utils::flip_tree<ExpandedHead>::Result  NewHead;
-        typedef typename get_kernel_arguments<Arg>::Result            Arguments;
-        
-        static const std::string shared_memory ( Int2Type<false> ) { return ""; }
-        static const std::string shared_memory ( Int2Type<true> )  { return ",__local float* shared_memory_ptr\n"; }
-
-      public:
-        static const std::string value ( std::string const & name )
-        {
-          return "__kernel void " + name + "(\n"
-                + arguments_list<Arguments>::string_value()
-                + shared_memory ( Int2Type<requires_local_buffer_list<Arg>::value>() )
-                + ")\n";
-        }
-    };
-
-
-    template<class TreeList, bool is_in_temporary_kernel, bool is_first = true>
-    struct kernel_core;
-
-    template<class TList>
-    struct finalize_inner_products 
-    {
-      static const std::string value() { return ""; }
-    };
-
-    template<class Head, class Tail>
-    struct finalize_inner_products<typelist<Head,Tail> > 
-    {
-      static const std::string value() 
-      {
-          return make_code<Head,assign_type,Head>::value() + finalize_inner_products<Tail>::value();
-      }
-    };
-
-    template<class Head, class Tail, bool is_in_temporary_kernel, bool is_first>
-    struct kernel_core<typelist<Head, Tail>, is_in_temporary_kernel, is_first > 
-    {
-      private:
-        typedef typelist<Head, Tail> Arg;
-        typedef typename tree_utils::expand<Head>::Result ExpandedHead;
-        typedef typename tree_utils::flip_tree<ExpandedHead>::Result NewHead;
-        typedef typename generate_tokens<NewHead, is_in_temporary_kernel>::Result Tokens;
-        typedef typename tree_utils::extract_if<typename NewHead::RHS,is_inner_product_leaf>::Result InProdsT;
-        typedef typename typelist_utils::no_duplicates<InProdsT>::Result InProds;
-        typedef typename Head::LHS LHS;
-
-        static const std::string additional_declarations ( Int2Type<true> ) {
-            return  "float sum;\n";
-        }
-
-        static const std::string additional_declarations ( Int2Type<false> ) { return  "" ; }
-
-        
-        static const std::string head ( Int2Type<true> ) {
-            return  "{\n"
-                    + additional_declarations ( Int2Type<requires_local_buffer_list<Arg>::value>() );
-        }
-
-        static const std::string head ( Int2Type<false> ) { return  "\n" ; }
-
-      public:
-        static const std::string value() 
-        {
-            return head ( Int2Type<is_first>() )
-                  + finalize_inner_products<InProds>::value()
-                  + calculate_tokens<Tokens, LHS>::value()
-                  + kernel_core<Tail, is_in_temporary_kernel, false>::value();
-        }
-    };
-
-    template<bool is_in_temporary_kernel>
-    struct kernel_core<NullType, is_in_temporary_kernel, false> 
-    {
-      static const std::string value() { return "}"; }
-    };
-
-    template<class T>
-    struct remove_temporary 
-    {
-      typedef T Result;
-    };
-
-    template<class Ref>
-    struct remove_temporary<tmp_symbolic_vector<Ref> > 
-    {
-      typedef Ref Result;
-    };
-
-    template<class LHS, class OP, class RHS>
-    struct remove_temporary<compound_node<LHS,OP,RHS,true> > 
-    {
-      typedef compound_node<LHS,OP,RHS> Result;
-    };
-
-    template<class TreeList, class Assigned>
-    struct get_all_temporaries 
-    {
-      private:
-        typedef typename TreeList::Head Head;
-        typedef typename TreeList::Tail Tail;
-        typedef typename remove_temporary<Head>::Result                                    NewHead;
-        typedef typename tree_utils::register_temporaries<NewHead, true, Assigned>::Result Registered;
-        typedef typename tree_utils::extract_if<Registered,is_temporary>::Result           Temporaries;
-        typedef typename get_all_temporaries<Tail, Assigned>::Result                       NewList;
-        
-      public:
-        typedef typename typelist_utils::fuse<Temporaries, NewList>::Result      Result;
-    };
-
-    template<class Assigned>
-    struct get_all_temporaries<NullType, Assigned> 
-    {
-      typedef NullType Result;
-    };
-
-    template<class T>
-    struct Unroll
-    {
-      typedef NullType Result;
-    };
-
-    template<class Head, class Tail>
-    struct Unroll<typelist<Head, Tail> > 
-    {
-      typedef typename typelist_utils::fuse<Head,
-                                            typename Unroll<Tail>::Result >::Result    Result;
-    };
-
-    template<class HeadHead, class HeadTail, class Tail>
-    struct Unroll<typelist<typelist<HeadHead, HeadTail>, Tail> > 
-    {
-      typedef typename typelist_utils::fuse<typelist<HeadHead, HeadTail>,
-                                            typename Unroll<Tail>::Result >::Result    Result;
-    };
-
-    template<>
-    struct Unroll<typelist<NullType, NullType> > 
-    {
-      typedef NullType Result;
-    };
-
-    template<class T>
-    struct find_prior_implementations 
-    {
-      typedef T Result;
-    };
-
-    template<class LHS, class RHS>
-    struct find_prior_implementations<compound_node<LHS, inner_prod_type, RHS,true> > 
-    {
-      typedef inner_prod_impl_t<compound_node<LHS, inner_prod_type, RHS,true> > Result;
-    };
-
-    template<class Head, class Tail>
-    struct find_prior_implementations<typelist<Head,Tail> > 
-    {
-      private:
-        typedef typename find_prior_implementations<Head>::Result NewHead;
-        typedef typename find_prior_implementations<Tail>::Result NewTail;
-        
-      public:
-        typedef typelist<NewHead,NewTail> Result;
-    };
-
-    template<class TreeList, class Assigned>
-    struct register_kernels 
-    {
-      private:
-        typedef typename get_all_temporaries<TreeList, Assigned>::Result Temporaries;
-        typedef typename register_kernels<Temporaries, Assigned>::Result CurrentList;
-        typedef typename typelist_utils::erase<Temporaries,
-                                               typename Unroll<CurrentList>::Result>::Result  NextTemporaries;
-        typedef typename find_prior_implementations<NextTemporaries>::Result                  NextList;
-
-      public:
-        typedef typename typelist_utils::append<CurrentList, NextList>::Result Result;
-    };
-
-    template<class Assigned>
-    struct register_kernels<NullType, Assigned> 
-    {
-      typedef typelist<NullType,NullType> Result;
-    };
-
-    template<class TreeList, bool is_in_temporary_kernel, class Enable = void>
-    struct kernel_string 
-    {
-      static const std::string value(std::string name) 
-      {
-        return std::string ( kernel_header<TreeList>::value(name)
-                             + kernel_core<TreeList, is_in_temporary_kernel>::value() );
-      }
-    };
-
-    template<class T>
-    struct make_impl;
-
-    template<class LHS_, class RHS_, bool is_temporary_>
-    struct make_impl<compound_node< LHS_, inner_prod_type, RHS_, is_temporary_ > > 
-    {
-      typedef inner_prod_impl_t<compound_node< LHS_, inner_prod_type, RHS_, is_temporary_ > >   Result;
-    };
-
-    template<class Temporary>
-    struct format_temporaries;
-
-    template<class Head, class Tail>
-    struct format_temporaries<typelist<Head,Tail> > 
-    {
-        typedef compound_node<Head, assign_type, typename remove_temporary<Head>::Result>                    NewHead;
-        typedef typename typelist_utils::append<typename format_temporaries<Tail>::Result, NewHead>::Result  Result;
-    };
-
-    template<>
-    struct format_temporaries<NullType> 
-    {
-      typedef NullType Result;
-    };
-
-    typedef std::map<std::string,std::string> KernelsSources;
-
-    template<class TemporaryKernelsList, class MainOperation, int Start, int End>
-    struct fill_sources
-    {
-      typedef typename format_temporaries<typename typelist_utils::type_at< TemporaryKernelsList,Start>::Result >::Result CurrentList;
-      
-      static void execute(KernelsSources & sources, std::string const & operation_name)
-      {
-        std::string current_kernel_name("__" + operation_name + "_kernel"  
-                                        + to_string(typelist_utils::length<TemporaryKernelsList>::value - 1 - Start));
-        sources.insert( std::make_pair( current_kernel_name,
-                                        kernel_string< CurrentList, true>::value(current_kernel_name) 
-                                      )
-                      );
-        fill_sources<TemporaryKernelsList, MainOperation, Start+1, End>::execute(sources, operation_name);
-      }
-    };
-
-    template<class TemporaryKernelsList, class MainOperation, int End>
-    struct fill_sources<TemporaryKernelsList, MainOperation, End, End>
-    {
-      static void execute(KernelsSources & sources, std::string const & operation_name)
-      {
-        sources.insert(std::make_pair(operation_name,
-                                      kernel_string<MainOperation, false>::value(operation_name) 
-                                     )
-                      );
-      }
-    };
-
-    typedef std::multimap<std::string, std::pair<unsigned int, result_of::runtime_wrapper*> > runtime_wrappers_t;
-
-    template<class U>
-    struct foreach_functor
-    {
-      static void execute(unsigned int & arg_pos,  runtime_wrappers_t & runtime_wrappers, std::string const & name) 
-      {
-        foreach_functor<typename result_of::expression_type<U>::Result >::execute(arg_pos, runtime_wrappers, name);
-      }
-    };
-
-    template<>
-    struct foreach_functor<NullType>;
-
-    template<class T>
-    struct foreach_functor<result_of::scalar_expression<T> >
-    {
-      static void execute(unsigned int & arg_pos, runtime_wrappers_t & runtime_wrappers, std::string const & name) 
-      {
-        runtime_wrappers.insert(runtime_wrappers_t::value_type(name,
-                                                               std::make_pair(arg_pos,
-                                                                              result_of::scalar_expression<T>::runtime_descriptor())
-                                                              )
-                               );
-        arg_pos += 1;
-      }
-    };
-
-    template<class T, class SIZE_DESCRIPTOR>
-    struct foreach_functor<result_of::vector_expression<T,SIZE_DESCRIPTOR> >
-    {
-      static void execute(unsigned int & arg_pos, runtime_wrappers_t & runtime_wrappers, std::string const & name) 
-      {
-        runtime_wrappers.insert(runtime_wrappers_t::value_type(name,
-                                                               std::make_pair(arg_pos,
-                                                                              result_of::vector_expression<T,SIZE_DESCRIPTOR>::runtime_descriptor())
-                                                              )
-                               );
-        arg_pos += 3;
-      }
-    };
-
-    template<class T, class SIZE1_DESCRIPTOR, class SIZE2_DESCRIPTOR>
-    struct foreach_functor<result_of::matrix_expression<T,SIZE1_DESCRIPTOR, SIZE2_DESCRIPTOR> >
-    {
-      static void execute(unsigned int & arg_pos, runtime_wrappers_t & runtime_wrappers, std::string const & name ) 
-      {
-        runtime_wrappers.insert(runtime_wrappers_t::value_type(name,
-                                                               std::make_pair(arg_pos,
-                                                                              result_of::matrix_expression<T,SIZE1_DESCRIPTOR,SIZE2_DESCRIPTOR>::runtime_descriptor())
-                                                              )
-                               );
-        arg_pos += 5;
-      }
-    };
-
-    template<class TemporaryKernelsList, class MainOperation, int Start, int End>
-    struct fill_args
-    {
-      typedef typename format_temporaries<typename typelist_utils::type_at<TemporaryKernelsList,Start>::Result >::Result CurrentList;
-      typedef typename get_kernel_arguments<CurrentList>::Result Arguments;
-      
-      static void execute(runtime_wrappers_t & runtime_wrappers, std::string const & operation_name)
-      {
-        unsigned int arg_pos = 0;
-        std::string current_kernel_name("__"+operation_name+"_kernel"
-                                        + to_string(typelist_utils::length<TemporaryKernelsList>::value - 1 - Start));
-        
-        typelist_utils::ForEach<Arguments, foreach_functor>::execute(arg_pos,runtime_wrappers,current_kernel_name);
-        
-        if(requires_local_buffer_list<CurrentList>::value)
-        {
-          runtime_wrappers.insert(runtime_wrappers_t::value_type(current_kernel_name,
-                                                                 std::make_pair(arg_pos,
-                                                                                new result_of::shared_memory_wrapper())
-                                                                )
-                                 );
-        }
-        
-        fill_args<TemporaryKernelsList,MainOperation,Start+1,End>::execute(runtime_wrappers,operation_name);
-      }
-    };
-
-    template<class TemporaryKernelsList, class MainOperation, int End>
-    struct fill_args<TemporaryKernelsList, MainOperation, End, End>
-    {
-      private:
-        typedef MainOperation CurrentList;
-        typedef typename get_kernel_arguments<CurrentList>::Result Arguments;
-            
-      public:
-        static void execute(runtime_wrappers_t & runtime_wrappers, std::string const & operation_name)
-        {
-          unsigned int arg_pos = 0;  
-          typelist_utils::ForEach<Arguments, foreach_functor>::execute(arg_pos, runtime_wrappers, operation_name);
-          if(requires_local_buffer_list<CurrentList>::value)
-          {
-            runtime_wrappers.insert(runtime_wrappers_t::value_type(operation_name,
-                                                                   std::make_pair(arg_pos,
-                                                                                  new result_of::shared_memory_wrapper())
-                                                                  )
-                                   );
-          }
-        }
-    };
-
-    template<class ARG>
-    struct program_infos
-    {
-      typedef typename tree_utils::register_temporaries<ARG,false, typename ARG::LHS>::Result   NewARG;
-      typedef typelist<NewARG,NullType>                                                         MainOperation_Init;
-      typedef typename register_kernels<MainOperation_Init, typename ARG::LHS>::Result          KernelsList;
-      
-      static std::string value(std::string const & name_hint, KernelsSources & sources, runtime_wrappers_t & runtime_wrappers)
-      {
-        std::string operation_name = ARG::name();
-        std::string program_name( (!name_hint.empty()) ? name_hint : operation_name );
-        
-        fill_sources<KernelsList,
-                     MainOperation_Init,
-                     0,
-                     typelist_utils::length<KernelsList>::value - 1>::execute(sources,operation_name);
-                     
-        fill_args<KernelsList,
-                  MainOperation_Init,
-                  0,
-                  typelist_utils::length<KernelsList>::value - 1>::execute(runtime_wrappers,operation_name);
-                  
-        return program_name;
-      }
-    };
-
-
-
-  } // namespace generator
-} // namespace viennacl
-#endif
diff --git a/viennacl/generator/make_code/expression.hpp b/viennacl/generator/make_code/expression.hpp
deleted file mode 100644
index 5ae8dc9..0000000
--- a/viennacl/generator/make_code/expression.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#ifndef VIENNACL_GENERATOR_MAKE_CODE_EXPRESSION_HPP
-#define VIENNACL_GENERATOR_MAKE_CODE_EXPRESSION_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/make_code/expression.hpp
- *   @brief Directives for generating code for simple expressions.
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/elementwise_modifier.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_scalars.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    template <class T>
-    struct make_expression_code 
-    {
-      static const std::string value(std::string const & loop_accessor) 
-      {
-        return  T::name() + '[' + loop_accessor  + ']';
-      }
-    };
-
-    template <unsigned int ID, class SCALARTYPE>
-    struct make_expression_code<cpu_symbolic_scalar<ID,SCALARTYPE> > 
-    {
-      static const std::string value(std::string const & loop_accessor) 
-      {
-        return  cpu_symbolic_scalar<ID,SCALARTYPE>::name();
-      }
-    };
-
-    template <unsigned int ID,class SCALARTYPE>
-    struct make_expression_code<gpu_symbolic_scalar<ID,SCALARTYPE> > 
-    {
-      static const std::string value(std::string const & loop_accessor) 
-      {
-        return  '*' + gpu_symbolic_scalar<ID,SCALARTYPE>::name();
-      }
-    };
-
-    template <class LHS, class RHS, bool is_temporary >
-    struct make_expression_code<compound_node<LHS,inner_prod_type,RHS, is_temporary> > 
-    {
-      private:
-        typedef compound_node<LHS,inner_prod_type,RHS, is_temporary> T;
-        
-      public:
-        static const std::string value(std::string const & loop_accessor) 
-        {
-          return T::name() +"_sum";
-        }
-    };
-
-    template< >
-    struct make_expression_code< NullType > 
-    {
-      static const std::string value(std::string const & loop_accessor) 
-      {
-          return "0";
-      }
-    };
-
-    template<class T, std::string (*U)()>
-    struct make_expression_code< elementwise_modifier_impl<T, U> > 
-    {
-      typedef elementwise_modifier_impl<T, U> EW_M;
-      static const std::string value ( std::string const & loop_accessor ) 
-      {
-        return EW_M::modify(make_expression_code<T>::value(loop_accessor));
-      }
-    };
-
-    template<class LHS, class OP, class RHS >
-    struct make_expression_code<compound_node<LHS, OP, RHS, false> > 
-    {
-      static const std::string value(std::string const & loop_accessor = "k") 
-      {
-        return make_expression_code<LHS>::value(loop_accessor)
-               + OP::expression_string() 
-               + make_expression_code<RHS>::value(loop_accessor);
-      }
-    };
-
-    template<class LHS, class RHS, unsigned int Alignment>
-    struct dot_product_impl
-    {
-      static const std::string value(std::string lhs_loop_id,
-                                     std::string rhs_loop_id)
-      {
-        return "dot(" + make_expression_code<LHS>::value(lhs_loop_id) + "," + make_expression_code<RHS>::value(rhs_loop_id) + ")";
-      }
-    };
-
-    template<class LHS, class RHS>
-    struct dot_product_impl<LHS, RHS, 8>
-    {
-      static const std::string value(std::string lhs_loop_id,
-                                     std::string rhs_loop_id)
-      {
-        return "dot(" + make_expression_code<LHS>::value(lhs_loop_id) + ".s0123" + ","
-                      + make_expression_code<RHS>::value(rhs_loop_id) + ".s0123 )" 
-         +  " + dot("	+ make_expression_code<LHS>::value(lhs_loop_id) + ".s4567" + ","
-                      + make_expression_code<RHS>::value(rhs_loop_id) + ".s4567 );"
-        ;
-      }
-    };
-
-    template<class LHS, class RHS>
-    struct dot_product_impl<LHS, RHS, 16>
-    {
-      static const std::string value(std::string lhs_loop_id,std::string rhs_loop_id)
-      {
-        return "dot(" + make_expression_code<LHS>::value(lhs_loop_id) + ".s0123" + ","
-                      + make_expression_code<RHS>::value(rhs_loop_id) + ".s0123)" 
-        +"\n	+ dot("	+ make_expression_code<LHS>::value(lhs_loop_id) + ".s4567" + "," 
-                      + make_expression_code<RHS>::value(rhs_loop_id) + ".s4567) "
-        +"\n	+ dot("	+ make_expression_code<LHS>::value(lhs_loop_id) + ".s89ab" + "," 
-                      + make_expression_code<RHS>::value ( rhs_loop_id ) + ".s89ab) "
-        +"\n	+ dot("	+ make_expression_code<LHS>::value ( lhs_loop_id ) + ".scdef" + "," 
-                      + make_expression_code<RHS>::value ( rhs_loop_id ) + ".scdef)" 
-        ;
-      }
-    };
-
-    template<class LHS, class RHS>
-    struct dot_product
-    {
-      static const std::string value(std::string lhs_loop_id,std::string rhs_loop_id)
-      {
-        return dot_product_impl<LHS,RHS,LHS::Alignment>::value(lhs_loop_id,rhs_loop_id);
-      }
-    };
-
-  }
-
-}
-
-#endif
-
-
diff --git a/viennacl/generator/make_code/inner_product.hpp b/viennacl/generator/make_code/inner_product.hpp
deleted file mode 100644
index 101c1fb..0000000
--- a/viennacl/generator/make_code/inner_product.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-#ifndef VIENNACL_GENERATOR_MAKE_CODE_INNER_PRODUCT_HPP
-#define VIENNACL_GENERATOR_MAKE_CODE_INNER_PRODUCT_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/make_code/inner_product.hpp
- *   @brief Directives for generating code for the inner product.
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include "viennacl/generator/make_code/expression.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/tree_operations.hpp"
-
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    template <class T>
-    struct inner_prod_impl_t 
-    {
-      typedef T PRIOR_TYPE;
-      
-      static const std::string name() 
-      {
-        return T::name();
-      }
-      
-      static const std::string kernel_arguments() 
-      {
-        return T::kernel_arguments();
-      }
-      enum { id = T::id };
-    };
-
-    template <class TOKEN, class OP, class ASSIGNED, class Enable=void>
-    struct make_code;
-
-    template <class T, class OP, class ASSIGNED>
-    struct make_code<inner_prod_impl_t<T>, OP, ASSIGNED> 
-    {
-      private:
-        typedef typename tree_utils::extract_if<T,is_pure_inner_product_leaf>::Result::Head ARG;
-        typedef typename ARG::LHS LHS;
-        typedef typename ARG::RHS RHS;
-
-        static const std::string main_size() 
-        {
-          return result_of::expression_type<LHS>::Result::internal_size_expression();
-        }
-
-      public :
-
-        static const std::string value() 
-        {
-          return  "sum = 0;\n"
-                  "for (unsigned int k = (get_group_id(0) * " + main_size() + ")/get_num_groups(0)+ get_local_id(0); k < ((get_group_id(0)+1) * " + main_size() +")/get_num_groups(0); k += get_local_size(0))\n"
-                  "  sum += " + dot_product<LHS,RHS>::value("k","k") + ";\n"
-                  "shared_memory_ptr[get_local_id(0)] = sum;\n"
-
-                  "for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
-                  "  {\n"
-                  "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-                  "    if (get_local_id(0) < stride)\n"
-                  "    shared_memory_ptr[get_local_id(0)] += shared_memory_ptr[get_local_id(0)+stride];\n"
-                  "  }\n"
-                  "barrier(CLK_LOCAL_MEM_FENCE);\n"
-                  "if (get_local_id(0) == 0)\n"
-                  "  " + ASSIGNED::name() + "[get_group_id(0)] = shared_memory_ptr[0];\n";
-        }
-        
-        viennacl::generator::compound_node< const char*, add_type, const char* > value(const char* arg1);
-    };
-
-    template <class T, class OP>
-    struct make_code<T, OP, T, typename enable_if<is_inner_product_leaf<T> >::type> 
-    {
-      private:
-        typedef typename tree_utils::extract_if<T,is_pure_inner_product_leaf>::Result::Head ARG;
-        typedef typename ARG::LHS LHS;
-        typedef typename ARG::RHS RHS;
-
-      public:
-
-        static const std::string value() 
-        {
-          return  "sum = 0;\n"
-                  "local float " + ARG::name() + "_sum;\n"
-                  "for (unsigned int i = get_local_id(0) ; i<get_num_groups(0) ; i+=get_local_size(0))\n"
-                  "{\n"
-                  "   sum+= " +ARG::name() +"[i];\n"
-                  "};\n"
-                  "shared_memory_ptr[get_local_id(0)]=sum;\n"
-                  "for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
-                  "  {\n"
-                  "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-                  "    if (get_local_id(0) < stride)\n"
-                  "     shared_memory_ptr[get_local_id(0)] += shared_memory_ptr[get_local_id(0)+stride];\n"
-                  "  }\n"
-      "if(get_local_id(0)==0);\n"
-                  +ARG::name() + "_sum = shared_memory_ptr[0];\n"
-                  "barrier(CLK_LOCAL_MEM_FENCE);\n";
-        }
-    };
-
-  }
-
-}
-
-#endif
-
-
diff --git a/viennacl/generator/make_code/make_code.hpp b/viennacl/generator/make_code/make_code.hpp
deleted file mode 100644
index d4ef013..0000000
--- a/viennacl/generator/make_code/make_code.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef VIENNACL_GENERATOR_MAKE_CODE_MAKE_CODE_HPP
-#define VIENNACL_GENERATOR_MAKE_CODE_MAKE_CODE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/make_code/make_code.hpp
- *   @brief Convenience header file for the code generation step
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include "viennacl/generator/make_code/inner_product.hpp"
-#include "viennacl/generator/make_code/matrix-vector_product.hpp"
-#include "viennacl/generator/make_code/regular_compound_node.hpp"
-#include "viennacl/generator/make_code/expression.hpp"
-
-#endif
-
-
diff --git a/viennacl/generator/make_code/matrix-vector_product.hpp b/viennacl/generator/make_code/matrix-vector_product.hpp
deleted file mode 100644
index 14d0dfa..0000000
--- a/viennacl/generator/make_code/matrix-vector_product.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#ifndef VIENNACL_GENERATOR_MAKE_CODE_MATRIX_VECTOR_PRODUCT_HPP
-#define VIENNACL_GENERATOR_MAKE_CODE_MATRIX_VECTOR_PRODUCT_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/make_code/matrix-vector_product.hpp
- *   @brief Directives for generating code for the matrix-vector product
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/generator/make_code/expression.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/meta_tools/typelist.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/tree_operations.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    template<class T, class OP, class ASSIGNED>
-    struct make_product_code;
-
-    template<class T, class SIZE_DESCRIPTOR, class OP, class ASSIGNED>
-    struct make_product_code<result_of::vector_expression<T,SIZE_DESCRIPTOR>, OP, ASSIGNED> 
-    {
-      private:
-        typedef typename tree_utils::remove_if<T, is_pure_product_leaf>::Result                           SCALAR_EXPR;
-        typedef typename tree_utils::extract_if<T,is_pure_product_leaf>::Result::Head                     ARG;
-        typedef typename generate_tokens<compound_node<NullType,assign_type,SCALAR_EXPR>, false>::Result  Tokens;
-        typedef typename ARG::LHS                                   LHS;
-        typedef typename ARG::RHS                                   RHS;
-        typedef typename result_of::expression_type<LHS>::Result    MatExpr;
-        typedef typename MatExpr::ScalarType                        ScalarType;
-        typedef typename MatExpr::Layout                            Layout;
-
-        static const unsigned int Alignment = result_of::expression_type<LHS>::Result::Alignment;
-        
-        static const std::string assign_res(Int2Type<true>) 
-        {
-          return ASSIGNED::name() + "[ row ]" + OP::expression_string() +  "dot_prod ;";
-        }
-
-        static const std::string assign_res(Int2Type<false>)
-        {
-          return ASSIGNED::name() + "[ row ]" + OP::expression_string() + make_expression_code<SCALAR_EXPR>::value ( "k" ) + "* dot_prod ;";
-        }
-
-        static const std::string expression_string() 
-        {
-          return make_expression_code<LHS>::value() + "*" +  make_expression_code<RHS>::value();
-        }
-
-        static const std::string fill_ith_row(viennacl::row_major)
-        {
-          std::string internal_size_2_expression = MatExpr::internal_size2_expression();
-          if(Alignment==1)
-            return  " dot_prod +=  " + dot_product<LHS,RHS>::value("row *" + internal_size_2_expression + " + col","col") + ";\n";
-          else if (Alignment == 16)
-            return " unsigned int scaled_row = row * " + to_string(Alignment) + ";\n"
-                  +  "dot_prod.s0 +=  " + dot_product<LHS,RHS>::value("scaled_row *" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s1 +=  " + dot_product<LHS,RHS>::value("(scaled_row+1)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s2 +=  " + dot_product<LHS,RHS>::value("(scaled_row+2)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s3 +=  " + dot_product<LHS,RHS>::value("(scaled_row+3)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s4 +=  " + dot_product<LHS,RHS>::value("(scaled_row+4)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s5 +=  " + dot_product<LHS,RHS>::value("(scaled_row+5)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s6 +=  " + dot_product<LHS,RHS>::value("(scaled_row+6)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s7 +=  " + dot_product<LHS,RHS>::value("(scaled_row+7)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s8 +=  " + dot_product<LHS,RHS>::value("(scaled_row+8)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.s9 +=  " + dot_product<LHS,RHS>::value("(scaled_row+9)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.sa +=  " + dot_product<LHS,RHS>::value("(scaled_row+10)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.sb +=  " + dot_product<LHS,RHS>::value("(scaled_row+11)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.sc +=  " + dot_product<LHS,RHS>::value("(scaled_row+12)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.sd +=  " + dot_product<LHS,RHS>::value("(scaled_row+13)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.se +=  " + dot_product<LHS,RHS>::value("(scaled_row+14)*" + internal_size_2_expression + " + col","col") + ";\n"
-                  + "  dot_prod.sf +=  " + dot_product<LHS,RHS>::value("(scaled_row+15)*" + internal_size_2_expression + " + col","col") + ";\n";
-          else
-            return "ALIGNMENT NOT IMPLEMENTED";
-        }
-
-        static const std::string fill_ith_row(viennacl::column_major)
-        {
-          std::string internal_size_1_expression = MatExpr::internal_size1_expression();
-          VIENNACL_STATIC_ASSERT(Alignment==1);
-          return   " dot_prod +=  " + dot_product<LHS,RHS>::value("row  + col * " +  internal_size_1_expression, "col") + ";\n";
-          
-    //       if(Alignment==1)
-    //             return   " dot_prod +=  " + dot_product<LHS,RHS>::value("row  + col * " +  internal_size_1_expression, "col") + ";\n";                    ;
-    //       else
-    //            return "ALIGNMENT NOT IMPLEMENTED";
-    //       
-        }
-        
-      public:
-        static const std::string value() 
-        {
-          return
-              "for (unsigned int row = get_global_id(0) ; row < " + MatExpr::internal_size1_expression() + " ; row += get_global_size(0))\n"
-              "{\n"
-              + print_type<ScalarType,Alignment>::value()+" dot_prod = 0;\n"
-              "for (unsigned int col = 0; col < " + MatExpr::internal_size2_expression() + "; ++col){\n"
-              + fill_ith_row(Layout() )
-              + "}\n"
-              + assign_res ( Int2Type<is_null_type<SCALAR_EXPR>::value>() ) + "\n"
-              + "}\n";
-        }
-
-    };
-      
-    template <class T, class OP, class ASSIGNED>
-    struct make_code<T, OP, ASSIGNED, typename enable_if<is_product_leaf<T> >::type> 
-    {
-      static const std::string value() 
-      {
-        typedef typename result_of::expression_type<T>::Result U;
-        return make_product_code<U,OP,ASSIGNED>::value();
-      }
-    };
-
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/make_code/regular_compound_node.hpp b/viennacl/generator/make_code/regular_compound_node.hpp
deleted file mode 100644
index bb13afd..0000000
--- a/viennacl/generator/make_code/regular_compound_node.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef MAKE_CODE_REGULAR_COMPOUND_NODE_HPP
-#define MAKE_CODE_REGULAR_COMPOUND_NODE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/make_code/regular_compound_node.hpp
- *  @brief Directives for generating code for the matrix-vector product
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include "expression.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_matrix.hpp"
-
-namespace viennacl
-{
-  namespace generator
-  {
-
-    template <class T>
-    struct get_loop_bound_impl;
-
-    template <class T, class SIZE_DESCRIPTOR>
-    struct get_loop_bound_impl<result_of::vector_expression<T, SIZE_DESCRIPTOR> > 
-    {
-      static const std::string value() 
-      {
-        return result_of::vector_expression<T, SIZE_DESCRIPTOR>::internal_size_expression();
-      }
-    };
-
-    template <class T, class SIZE1_DESCRIPTOR, class SIZE2_DESCRIPTOR>
-    struct get_loop_bound_impl<result_of::matrix_expression<T, SIZE1_DESCRIPTOR, SIZE2_DESCRIPTOR> > 
-    {
-      private:
-        typedef result_of::matrix_expression<T, SIZE1_DESCRIPTOR, SIZE2_DESCRIPTOR> Arg;
-        
-      public:
-        static const std::string value() 
-        {
-          return Arg::internal_size2_expression() + "*" + Arg::internal_size1_expression();
-        }
-    };
-
-    template <class T>
-    struct get_loop_bound 
-    {
-      static const std::string value() 
-      {
-        return get_loop_bound_impl<typename result_of::expression_type<T>::Result>::value();
-      }
-    };
-
-    template <class T, class ASSIGN_OP, class ASSIGNED, class Enable>
-    struct make_code 
-    {
-      static const std::string value() 
-      {
-        return "for ( unsigned int k = get_global_id(0)"
-               " ; k < " + get_loop_bound<ASSIGNED>::value()
-               +" ; k += get_global_size(0) ) \n"
-               + "{\n"
-               + make_expression_code<ASSIGNED>::value("k") + ASSIGN_OP::expression_string() + make_expression_code<T>::value("k") + ";\n"
-               + "}\n";
-      }
-    };
-
-    template<class T, class ASSIGN_OP, unsigned int ASSIGNED_ID, class ASSIGNED_TYPE>
-    struct make_code<T, ASSIGN_OP, gpu_symbolic_scalar<ASSIGNED_ID,ASSIGNED_TYPE> > 
-    {
-      private:
-        typedef gpu_symbolic_scalar<ASSIGNED_ID,ASSIGNED_TYPE>  ASSIGNED;
-      public:
-        static const std::string value() 
-        {
-          return "if(get_global_id(0) == 0) " 
-                 + make_expression_code<ASSIGNED>::value("0") + '\n' 
-                 + ASSIGN_OP::expression_string() + make_expression_code<T>::value ( "k" ) + ";\n" ;
-        }
-    };
-
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/meta_tools/typelist.hpp b/viennacl/generator/meta_tools/typelist.hpp
deleted file mode 100644
index a5f17bd..0000000
--- a/viennacl/generator/meta_tools/typelist.hpp
+++ /dev/null
@@ -1,386 +0,0 @@
-#ifndef VIENNACL_GENERATOR_META_TOOLS_TYPELIST_HPP
-#define VIENNACL_GENERATOR_META_TOOLS_TYPELIST_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file typelist.hpp
- *  @brief Generic implementation of the typelist
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-
-
-namespace viennacl 
-{
-  namespace generator
-  {
-    template <class T,class U>
-    struct typelist
-    {
-      typedef T Head;
-      typedef U Tail;
-
-      static const std::string name()
-      {
-        return Head::name() + "  ;  " + Tail::name();
-      }
-    };
-
-    namespace typelist_utils 
-    {
-
-      /*
-       * Is Empty
-       */
-
-      template
-      <
-          typename T1  = NullType, typename T2  = NullType, typename T3  = NullType,
-          typename T4  = NullType, typename T5  = NullType, typename T6  = NullType,
-          typename T7  = NullType, typename T8  = NullType, typename T9  = NullType,
-          typename T10 = NullType, typename T11 = NullType, typename T12 = NullType,
-          typename T13 = NullType, typename T14 = NullType, typename T15 = NullType,
-          typename T16 = NullType, typename T17 = NullType, typename T18 = NullType
-      > 
-      struct make_typelist
-      {
-        private:
-          typedef typename make_typelist
-          <
-              T2 , T3 , T4 , 
-              T5 , T6 , T7 , 
-              T8 , T9 , T10, 
-              T11, T12, T13,
-              T14, T15, T16, 
-              T17, T18
-          >
-          ::Result TailResult;
-
-        public:
-          typedef typelist<T1, TailResult> Result;
-      };
-
-      template <>
-      struct make_typelist<>
-      {
-          typedef NullType Result;
-      };
-
-      template <class TList>
-      struct is_empty 
-      {
-          enum { value = 0 };
-      };
-
-      template <>
-      struct is_empty<NullType> 
-      {
-          enum { value = 1 };
-      };
-
-
-      /*
-       * FOREACH
-       */
-
-
-      template <class TList,template<class> class Functor>
-      struct ForEach;
-
-      template <template<class> class Functor>
-      struct ForEach<NullType,Functor> 
-      {
-        static void execute() {}
-        
-        template <class T1>
-        static void execute(T1  & t1) {}
-        
-        template <class T1, class T2>
-        static void execute(T1  & t1, T2  & t2) {}
-        
-        template <class T1, class T2, class T3>
-        static void execute(T1  & t1, T2  & t2, T3 & t3) {}
-        
-        template <class T1, class T2, class T3, class T4>
-        static void execute(T1  & t1, T2  & t2, T3 & t3, T4 & t4) {}
-        
-        template <class T1, class T2, class T3, class T4, class T5>
-        static void execute(T1  & t1, T2  & t2, T3 & t3, T4 & t4, T5 & t5) {}
-          
-      };
-
-      template <class T, class U,template<class> class Functor>
-      struct ForEach< typelist<T, U>, Functor >
-      {
-        static void execute()
-        {
-          Functor<T>::execute();
-          ForEach<U, Functor>::execute();
-        }
-        
-        template <class T1>
-        static void execute(T1  & t1)
-        {
-          Functor<T>::execute(t1);
-          ForEach<U,Functor>::execute(t1);
-        }
-        
-        template <class T1, class T2>
-        static void execute(T1  & t1, T2  &t2)
-        {
-          Functor<T>::execute(t1,t2);
-          ForEach<U,Functor>::execute(t1,t2);
-        }
-        
-        template <class T1, class T2, class T3>
-        static void execute(T1  & t1, T2  & t2, T3 & t3)
-        {
-          Functor<T>::execute(t1,t2,t3);
-          ForEach<U,Functor>::execute(t1,t2,t3);
-        }
-        
-        template <class T1, class T2, class T3, class T4>
-        static void execute(T1  & t1, T2  & t2, T3 & t3, T4 & t4)
-        {
-          Functor<T>::execute(t1,t2,t3,t4);
-          ForEach<U,Functor>::execute(t1,t2,t3,t4);
-        }
-        
-        template <class T1, class T2, class T3, class T4, class T5>
-        static void execute(T1  & t1, T2  & t2, T3 & t3, T4 & t4, T5 & t5)
-        {
-          Functor<T>::execute(t1,t2,t3,t4,t5);
-          ForEach<U,Functor>::execute(t1,t2,t3,t4,t5);
-        }
-      };
-
-
-      /*
-       * length
-       */
-
-
-      template <class TList>
-      struct length;
-
-      template <>
-      struct length<NullType> 
-      {
-        enum { value = 0 };
-      };
-
-      template <class T, class U>
-      struct length< typelist<T, U> > 
-      {
-        enum { value = 1 + length<U>::value };
-      };
-
-      /*
-       * type_at
-       */
-
-      template <class TList, unsigned int i>
-      struct type_at;
-
-      template <class Head, class Tail>
-      struct type_at<typelist<Head, Tail>, 0> 
-      {
-        typedef Head Result;
-      };
-
-      template <class Head, class Tail, unsigned int i>
-      struct type_at<typelist<Head, Tail>, i> 
-      {
-        typedef typename type_at<Tail, i - 1>::Result Result;
-      };
-
-      /*
-       * index_of
-       */
-
-      template <class TList, class T>
-      struct index_of;
-
-      template <class T>
-      struct index_of<NullType, T> 
-      {
-        enum { value = -1 };
-      };
-
-      template <class T, class Tail>
-      struct index_of<typelist<T, Tail>, T> 
-      {
-        enum { value = 0 };
-      };
-
-      template <class Head, class Tail, class T>
-      struct index_of<typelist<Head, Tail>, T> 
-      {
-        private:
-          enum { temp = index_of<Tail, T>::value };
-          
-        public:
-          enum { value = temp == -1 ? -1 : 1 + temp };
-      };
-
-      /*
-       * append
-       */
-
-      template <class T1, class T2>
-      struct compare1 
-      {
-        enum { value = static_cast<int> ( T1::id ) < static_cast<int> ( T2::id ) };
-      };
-
-      template <class T>
-      struct compare1<NullType, T> 
-      {
-        enum { value = 0 };
-      };
-
-
-      template <class T1, class T2>
-      struct true_comp 
-      {
-        enum { value = 1 };
-      };
-
-      template <class TList, class T, template<class,class> class Compare = true_comp>
-      struct append;
-
-      template <template<class,class> class Compare>
-      struct append<NullType, NullType, Compare> 
-      {
-        typedef NullType Result;
-      };
-
-      template <class T, template<class,class> class Compare>
-      struct append<NullType, T, Compare> 
-      {
-        typedef typelist<T,NullType> Result;
-      };
-
-      template <class Head, class Tail, template<class,class> class Compare>
-      struct append<NullType, typelist<Head, Tail>, Compare > 
-      {
-        typedef typelist<Head, Tail> Result;
-      };
-
-      template <class Head, class Tail, template<class,class> class Compare>
-      struct append<typelist<Head, Tail>, NullType, Compare > 
-      {
-        typedef typelist<Head, Tail> Result;
-      };
-
-      template <class Head, class Tail, class T, template<class,class> class Compare>
-      struct append<typelist<Head,Tail>, T, Compare> 
-      {
-        private:
-          typedef typelist<Head, typename append<Tail, T, Compare>::Result> TypeCompareFalse;
-          typedef typelist<T, typelist<Head,Tail> > TypeCompareTrue;
-          
-        public:
-          typedef typename get_type_if<TypeCompareTrue,TypeCompareFalse,Compare<T,Head>::value >::Result Result;
-      };
-
-      /*
-       * fuse
-       */
-
-      template <class TList, class T, template<class,class> class Compare = true_comp>
-      struct fuse 
-      {
-        typedef typename append<TList, T, Compare>::Result Result;
-      };
-
-      template <class Head1, class Tail1, class Head2, class Tail2, template<class,class> class Compare >
-      struct fuse<typelist<Head1, Tail1>, typelist<Head2,Tail2>, Compare > 
-      {
-        private:
-          typedef typename append< typelist<Head1,Tail1> , Head2, Compare>::Result NewResult;
-
-        public:
-          typedef typename fuse< NewResult, Tail2, Compare >::Result Result;
-      };
-
-      /*
-       * erase
-       */
-
-
-      template<class TList, class T>
-      struct erase;
-
-      template <class T>
-      struct erase<NullType, T> 
-      {
-        typedef NullType Result;
-      };
-
-      template <class T, class Tail>
-      struct erase<typelist<T, Tail>, T>
-      {
-        typedef Tail Result;
-      };
-
-      template <class Head, class Tail, class T>
-      struct erase<typelist<Head, Tail>, T> 
-      {
-        typedef typelist<Head,
-                         typename erase<Tail, T>::Result>        Result;
-      };
-
-      template <class Head, class Tail, class Head2, class Tail2>
-      struct erase<typelist<Head, Tail>, typelist<Head2, Tail2> > 
-      {
-        typedef typename erase< typename erase<typelist<Head,Tail>, Head2>::Result, Tail2 >::Result Result;
-      };
-
-      /*
-       * No duplicate
-       */
-
-      template<class TList>
-      struct no_duplicates;
-
-      template <>
-      struct no_duplicates<NullType> 
-      {
-        typedef NullType Result;
-      };
-
-      template <class Head, class Tail>
-      struct no_duplicates< typelist<Head, Tail> > 
-      {
-        private:
-          typedef typename no_duplicates<Tail>::Result L1;
-          typedef typename erase<L1, Head>::Result L2;
-          
-        public:
-          typedef typelist<Head, L2> Result;
-      };
-
-    }
-  }
-}
-
-#endif
diff --git a/viennacl/generator/meta_tools/utils.hpp b/viennacl/generator/meta_tools/utils.hpp
deleted file mode 100644
index 28cf935..0000000
--- a/viennacl/generator/meta_tools/utils.hpp
+++ /dev/null
@@ -1,290 +0,0 @@
-#ifndef VIENNACL_GENERATOR_META_TOOLS_UTILS_HPP
-#define VIENNACL_GENERATOR_META_TOOLS_UTILS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/meta_tools/utils.hpp
- *  @brief Various metaprogramming utilities
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include <algorithm>
-#include <typeinfo>
-#include <iostream>
-#include "viennacl/matrix.hpp"
-
-#define VIENNACL_STATIC_ASSERT( x ) typedef char __STATIC_ASSERT__[( x )?1:-1]
-
-namespace viennacl 
-{
-
-  class any;
-
-  template<class T>
-  T any_cast(any& a);
-
-  class value_base
-  {
-    public:
-      virtual ~value_base() { }
-      virtual value_base* clone() const = 0;
-      virtual std::type_info const & type() const = 0;
-  };
-
-  template <class T>
-  class value : public value_base
-  {
-      friend T any_cast<>(any& a);
-      
-      T t;
-      
-    public:
-      value(const T& t_) : t(t_) { }
-      value_base* clone() const
-      {
-          return new value(t);
-      }
-      
-      std::type_info const &type() const {
-          return typeid(T);
-      }
-  };  
-
-  class any
-  {
-      template<class T>
-      friend T any_cast(any & a);
-      
-      value_base* v;
-      
-    public:
-      any() : v(0) { }
-      
-      template <class value_type>
-      any(const value_type& v_) : v(new value<value_type>(v_)) { }
-
-      any(any const & other) : v(other.v ? other.v->clone() : 0) {}
-
-      any& operator=(const any& other)
-      {
-          if(&other != this)
-          {
-              any copy(other);
-              swap(copy);
-          }
-          return *this;
-      }
-
-      void swap(any& other)
-      {
-          std::swap(v, other.v);
-      }
-
-      std::type_info const & type()
-      {
-        return v->type();
-      }
-      
-      ~any() { delete v; }
-  };
-
-  class bad_any_cast : public std::bad_cast
-  {
-    public:
-      virtual const char * what() const throw()
-      {
-          return "viennacl::bad_any_cast: "
-                 "failed conversion using viennacl::any_cast";
-      }
-  };
-
-  template <class T>
-  T any_cast(any& a)
-  {
-    value<T>* v = dynamic_cast<value<T>*>(a.v);
-    
-    if(v == 0)
-      throw bad_any_cast();
-    else
-      return v->t;
-  }
-
-
-  namespace generator
-  {
-    struct NullType 
-    {
-      static const std::string name() 
-      {
-          return "Null\n" ;
-      }
-    };
-
-    template <class T>
-    inline std::string to_string ( T const t ) 
-    {
-      std::stringstream ss;
-      ss << t;
-      return ss.str();
-    }
-
-    inline std::string to_string(viennacl::row_major    const) { return "rowmajor"; }
-    inline std::string to_string(viennacl::column_major const) { return "columnmajor"; }
-
-
-    template <int v>
-    struct Int2Type 
-    {
-      enum { value = v };
-    };
-
-    template<class TypeTrue, class TypeFalse, bool cond>
-    struct get_type_if 
-    {
-      typedef TypeTrue Result;
-    };
-
-    template<class TypeTrue, class TypeFalse>
-    struct get_type_if<TypeTrue, TypeFalse, false> 
-    {
-      typedef TypeFalse Result;
-    };
-
-    template<class T, class U>
-    struct are_same_type
-    {
-      enum { value = 0 };
-    };
-
-    template<class T>
-    struct are_same_type<T,T>
-    {
-      enum { value = 1 };
-    };
-
-
-        
-    template <bool B, class T = void>
-    struct enable_if_c 
-    {
-      typedef T type;
-    };
-
-    template <class T>
-    struct enable_if_c<false, T> {};
-
-    template <class Cond, class T = void>
-    struct enable_if : public enable_if_c<Cond::value, T> {};
-
-
-    template <bool B, class T = void>
-    struct disable_if_c 
-    {
-      typedef T type;
-    };
-
-    template <class T>
-    struct disable_if_c<true, T> {};
-
-    template <class Cond, class T = void>
-    struct disable_if : public disable_if_c<Cond::value, T> {};
-
-
-
-    template<class T>
-    struct print_align1_type;
-
-    template<>
-    struct print_align1_type<int> 
-    {
-      static const std::string value() { return "int"; }
-    };
-
-    template<>
-    struct print_align1_type<unsigned int>
-    {
-      static const std::string value() { return "unsigned int"; }
-    };
-
-    template<>
-    struct print_align1_type<long> 
-    {
-      static const std::string value() { return "long"; }
-    };
-
-    template<>
-    struct print_align1_type<unsigned long> 
-    {
-      static const std::string value() { return "long"; }
-    };
-
-    template<>
-    struct print_align1_type<float> 
-    {
-      static const std::string value() { return "float"; }
-    };
-
-    template<>
-    struct print_align1_type<double> 
-    {
-      static const std::string value() { return "double"; }
-    };
-
-    template<typename T, unsigned int ALIGNMENT>
-    struct print_aligned_type 
-    {
-	    static const std::string value() 
-	    {
-        return print_align1_type<T>::value() + to_string ( ALIGNMENT );
-      }
-    };
-
-    template<typename T>
-    struct print_aligned_type<T, 1>
-    {
-	    static const std::string value() 
-	    {
-        return print_align1_type<T>::value();
-      }
-    };
-
-    template<typename T, unsigned int ALIGNMENT>
-    struct print_type 
-    {
-      static const std::string value() 
-      {
-        return print_aligned_type<T,ALIGNMENT>::value();
-      }
-    };
-
-    template<typename T, unsigned int ALIGNMENT>
-    struct print_type<T*, ALIGNMENT> 
-    {
-      static const std::string value() 
-      {
-        return print_type<T,ALIGNMENT>::value() + "*" ;
-      }
-    };
-
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/operation_types.hpp b/viennacl/generator/operation_types.hpp
deleted file mode 100644
index f946a62..0000000
--- a/viennacl/generator/operation_types.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef VIENNACL_GENERATOR_OPERATION_TYPES_HPP
-#define VIENNACL_GENERATOR_OPERATION_TYPES_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/operation_types.hpp
- *   @brief Declaration of the types related to the operators
- */
-
-#include <sstream>
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    struct assign_type 
-    {
-      static const std::string expression_string() { return " = "; }
-      static const std::string name() { return "eq"; }
-    };
-
-    struct add_type 
-    {
-      static const std::string expression_string() { return " + "; }
-      static const std::string name() { return "p"; }
-    };
-
-    struct inplace_add_type 
-    {
-      static const std::string expression_string() { return " += "; }
-      static const std::string name() { return "p_eq"; }
-    };
-
-    struct sub_type 
-    {
-      static const std::string expression_string() { return " - "; }
-      static const std::string name() { return "m"; }
-    };
-
-    struct inplace_sub_type 
-    {
-      static const std::string expression_string() { return " -= "; }
-      static const std::string name() { return "m_eq"; }
-    };
-
-    struct scal_mul_type 
-    {
-      static const std::string expression_string() { return " * "; }
-      static const std::string name() { return "mu"; }
-    };
-
-    struct inplace_scal_mul_type 
-    {
-      static const std::string expression_string() { return " *= "; }
-      static const std::string name() { return "mu_eq"; }
-    };
-
-
-    struct scal_div_type
-    {
-      static const std::string expression_string() { return " / "; }
-      static const std::string name() { return "d"; }
-    };
-
-    struct inplace_scal_div_type 
-    {
-      static const std::string expression_string() { return " /= "; }
-      static const std::string name() { return "d_eq"; }
-    };
-
-    struct inner_prod_type 
-    {
-      static const std::string expression_string() { return "_i_"; }
-      static const std::string name() { return "i"; }
-    };
-
-    struct prod_type 
-    {
-      static const std::string expression_string() { return "_p_"; }
-      static const std::string name() { return "p"; }
-    };
-
-    template<class T>
-    struct make_inplace 
-    {
-      typedef T Result;
-    };
-
-    template<>
-    struct make_inplace<add_type> 
-    {
-      typedef inplace_add_type Result;
-    };
-
-    template<>
-    struct make_inplace<sub_type> 
-    {
-      typedef inplace_sub_type Result;
-    };
-
-    template<>
-    struct make_inplace<scal_mul_type> 
-    {
-      typedef inplace_scal_mul_type Result;
-    };
-
-    template<>
-    struct make_inplace<scal_div_type> 
-    {
-      typedef inplace_scal_div_type Result;
-    };
-
-  }
-}
-#endif
diff --git a/viennacl/generator/symbolic_types/convenience_typedef.hpp b/viennacl/generator/symbolic_types/convenience_typedef.hpp
deleted file mode 100644
index 4929f3e..0000000
--- a/viennacl/generator/symbolic_types/convenience_typedef.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-#ifndef VIENNACL_GENERATOR_SYMBOLIC_TYPES_CONVENIENCE_TYPEDEF_HPP
-#define VIENNACL_GENERATOR_SYMBOLIC_TYPES_CONVENIENCE_TYPEDEF_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file convenience_typedef.hpp
- *  @brief Convenience typedefs for quick creation of symbolic types
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include "viennacl/generator/symbolic_types/symbolic_vector.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_matrix.hpp"
-#include "viennacl/generator/symbolic_types/symbolic_scalars.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    //Symbolic vectors : float
-    
-    typedef symbolic_vector<0,float,1> symv_0_f;
-    typedef symbolic_vector<1,float,1> symv_1_f;
-    typedef symbolic_vector<2,float,1> symv_2_f;
-    typedef symbolic_vector<3,float,1> symv_3_f;
-    typedef symbolic_vector<4,float,1> symv_4_f;
-    typedef symbolic_vector<5,float,1> symv_5_f;
-    typedef symbolic_vector<6,float,1> symv_6_f;
-
-    typedef symbolic_vector<0,float,4> symv_0_f_4;
-    typedef symbolic_vector<1,float,4> symv_1_f_4;
-    typedef symbolic_vector<2,float,4> symv_2_f_4;
-    typedef symbolic_vector<3,float,4> symv_3_f_4;
-    typedef symbolic_vector<4,float,4> symv_4_f_4;
-    typedef symbolic_vector<5,float,4> symv_5_f_4;
-    typedef symbolic_vector<6,float,4> symv_6_f_4;
-
-    typedef symbolic_vector<0,float,16> symv_0_f_16;
-    typedef symbolic_vector<1,float,16> symv_1_f_16;
-    typedef symbolic_vector<2,float,16> symv_2_f_16;
-    typedef symbolic_vector<3,float,16> symv_3_f_16;
-    typedef symbolic_vector<4,float,16> symv_4_f_16;
-    typedef symbolic_vector<5,float,16> symv_5_f_16;
-    typedef symbolic_vector<6,float,16> symv_6_f_16;
-
-
-    //Symbolic vectors : double
-
-    typedef symbolic_vector<0,double,1> symv_0_d;
-    typedef symbolic_vector<1,double,1> symv_1_d;
-    typedef symbolic_vector<2,double,1> symv_2_d;
-    typedef symbolic_vector<3,double,1> symv_3_d;
-    typedef symbolic_vector<4,double,1> symv_4_d;
-    typedef symbolic_vector<5,double,1> symv_5_d;
-    typedef symbolic_vector<6,double,1> symv_6_d;
-
-    typedef symbolic_vector<0,double,4> symv_0_d_4;
-    typedef symbolic_vector<1,double,4> symv_1_d_4;
-    typedef symbolic_vector<2,double,4> symv_2_d_4;
-    typedef symbolic_vector<3,double,4> symv_3_d_4;
-    typedef symbolic_vector<4,double,4> symv_4_d_4;
-    typedef symbolic_vector<5,double,4> symv_5_d_4;
-    typedef symbolic_vector<6,double,4> symv_6_d_4;
-
-    typedef symbolic_vector<0,double,16> symv_0_d_16;
-    typedef symbolic_vector<1,double,16> symv_1_d_16;
-    typedef symbolic_vector<2,double,16> symv_2_d_16;
-    typedef symbolic_vector<3,double,16> symv_3_d_16;
-    typedef symbolic_vector<4,double,16> symv_4_d_16;
-    typedef symbolic_vector<5,double,16> symv_5_d_16;
-    typedef symbolic_vector<6,double,16> symv_6_d_16;
-
-
-    //Symbolic matrices : float
-
-    typedef symbolic_matrix<0,float,viennacl::row_major,1> symm_0_f;
-    typedef symbolic_matrix<1,float,viennacl::row_major,1> symm_1_f;
-    typedef symbolic_matrix<2,float,viennacl::row_major,1> symm_2_f;
-    typedef symbolic_matrix<3,float,viennacl::row_major,1> symm_3_f;
-    typedef symbolic_matrix<4,float,viennacl::row_major,1> symm_4_f;
-    typedef symbolic_matrix<5,float,viennacl::row_major,1> symm_5_f;
-    typedef symbolic_matrix<6,float,viennacl::row_major,1> symm_6_f;
-
-    typedef symbolic_matrix<0,float,viennacl::row_major,16> symm_0_f_r_16;
-    typedef symbolic_matrix<1,float,viennacl::row_major,16> symm_1_f_r_16;
-    typedef symbolic_matrix<2,float,viennacl::row_major,16> symm_2_f_r_16;
-    typedef symbolic_matrix<3,float,viennacl::row_major,16> symm_3_f_r_16;
-    typedef symbolic_matrix<4,float,viennacl::row_major,16> symm_4_f_r_16;
-    typedef symbolic_matrix<5,float,viennacl::row_major,16> symm_5_f_r_16;
-    typedef symbolic_matrix<6,float,viennacl::row_major,16> symm_6_f_r_16;
-
-
-    //Symbolic matrices : double
-
-    typedef symbolic_matrix<0,double,viennacl::row_major,1> symm_0_d;
-    typedef symbolic_matrix<1,double,viennacl::row_major,1> symm_1_d;
-    typedef symbolic_matrix<2,double,viennacl::row_major,1> symm_2_d;
-    typedef symbolic_matrix<3,double,viennacl::row_major,1> symm_3_d;
-    typedef symbolic_matrix<4,double,viennacl::row_major,1> symm_4_d;
-    typedef symbolic_matrix<5,double,viennacl::row_major,1> symm_5_d;
-    typedef symbolic_matrix<6,double,viennacl::row_major,1> symm_6_d;
-
-    typedef symbolic_matrix<0,double,viennacl::row_major,16> symm_0_d_r_16;
-    typedef symbolic_matrix<1,double,viennacl::row_major,16> symm_1_d_r_16;
-    typedef symbolic_matrix<2,double,viennacl::row_major,16> symm_2_d_r_16;
-    typedef symbolic_matrix<3,double,viennacl::row_major,16> symm_3_d_r_16;
-    typedef symbolic_matrix<4,double,viennacl::row_major,16> symm_4_d_r_16;
-    typedef symbolic_matrix<5,double,viennacl::row_major,16> symm_5_d_r_16;
-    typedef symbolic_matrix<6,double,viennacl::row_major,16> symm_6_d_r_16;
-
-
-    //CPU Symbolic scalar: float
-
-    typedef cpu_symbolic_scalar<0,float> c_syms_0_f;
-    typedef cpu_symbolic_scalar<1,float> c_syms_1_f;
-    typedef cpu_symbolic_scalar<2,float> c_syms_2_f;
-    typedef cpu_symbolic_scalar<3,float> c_syms_3_f;
-    typedef cpu_symbolic_scalar<4,float> c_syms_4_f;
-    typedef cpu_symbolic_scalar<5,float> c_syms_5_f;
-    typedef cpu_symbolic_scalar<6,float> c_syms_6_f;
-
-
-    //CPU Symbolic scalar: double
-
-    typedef cpu_symbolic_scalar<0,double> c_syms_0_d;
-    typedef cpu_symbolic_scalar<1,double> c_syms_1_d;
-    typedef cpu_symbolic_scalar<2,double> c_syms_2_d;
-    typedef cpu_symbolic_scalar<3,double> c_syms_3_d;
-    typedef cpu_symbolic_scalar<4,double> c_syms_4_d;
-    typedef cpu_symbolic_scalar<5,double> c_syms_5_d;
-    typedef cpu_symbolic_scalar<6,double> c_syms_6_d;
-
-
-    //GPU Symbolic scalar: float
-
-    typedef gpu_symbolic_scalar<0,float> syms_0_f;
-    typedef gpu_symbolic_scalar<1,float> syms_1_f;
-    typedef gpu_symbolic_scalar<2,float> syms_2_f;
-    typedef gpu_symbolic_scalar<3,float> syms_3_f;
-    typedef gpu_symbolic_scalar<4,float> syms_4_f;
-    typedef gpu_symbolic_scalar<5,float> syms_5_f;
-    typedef gpu_symbolic_scalar<6,float> syms_6_f;
-
-
-    //GPU Symbolic scalar: double
-
-    typedef gpu_symbolic_scalar<0,double> syms_0_d;
-    typedef gpu_symbolic_scalar<1,double> syms_1_d;
-    typedef gpu_symbolic_scalar<2,double> syms_2_d;
-    typedef gpu_symbolic_scalar<3,double> syms_3_d;
-    typedef gpu_symbolic_scalar<4,double> syms_4_d;
-    typedef gpu_symbolic_scalar<5,double> syms_5_d;
-    typedef gpu_symbolic_scalar<6,double> syms_6_d;
-
-
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/symbolic_types/symbolic_matrix.hpp b/viennacl/generator/symbolic_types/symbolic_matrix.hpp
deleted file mode 100644
index 3f474f6..0000000
--- a/viennacl/generator/symbolic_types/symbolic_matrix.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-#ifndef VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_MATRIX_HPP
-#define VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_MATRIX_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/symbolic_types/symbolic_matrix.hpp
- *  @brief Implementation of a symbolic matrix type
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-
-#include "viennacl/forwards.h"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/meta_tools/utils.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    /**
-    * @brief Symbolic matrix type
-    * 
-    * @tparam ID The argument ID of the matrix in the generated code
-    * @tparam SCALARTYPE The Scalartype of the matrix in the generated code
-    * @tparam F The Layout of the matrix in the generated code
-    * @tparam ALIGNMENT The Alignment of the matrix in the generated code
-    */
-    template<unsigned int ID, typename SCALARTYPE, class F, unsigned int ALIGNMENT>
-    class symbolic_matrix 
-    {
-        typedef symbolic_matrix<ID, SCALARTYPE, F, ALIGNMENT> self_type;
-
-      public:
-
-        enum { id = ID };
-
-        typedef SCALARTYPE ScalarType;
-
-        typedef F Layout;
-        
-        static const unsigned int Alignment = ALIGNMENT;
-        
-        typedef viennacl::matrix<ScalarType,F,Alignment> runtime_type;
-        
-        static const std::string name()
-        {
-          F layout;
-          return "m_a_" + viennacl::generator::to_string(layout) + "_" 
-                        + viennacl::generator::to_string(Alignment) + "_"
-                        + viennacl::generator::to_string<long>(id);
-        }
-
-        static const std::string size1_name() 
-        {
-          return "size1_" + name();
-        }
-
-        static const std::string size2_name() 
-        {
-          return "size2_" + name();
-        }
-
-        static const std::string internal_size1_name() 
-        {
-          return "internal_size1_" + name();
-        }
-
-        static const std::string internal_size2_name() 
-        {
-          return "internal_size2_" + name();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-          return " __global " + generator::print_type<SCALARTYPE*,Alignment>::value() + " " + name()
-                + ", unsigned int " + size1_name()
-                + ", unsigned int " + size2_name()
-                + ", unsigned int " + internal_size1_name()
-                + ", unsigned int " + internal_size2_name()
-                + "\n";
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<generator::is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, assign_type, RHS_TYPE > >::type
-        operator= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,assign_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<generator::is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_mul_type, RHS_TYPE > >::type
-        operator*= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_mul_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<generator::is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_div_type, RHS_TYPE > >::type
-        operator/= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_div_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<generator::is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_add_type, RHS_TYPE > >::type
-        operator+= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_add_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<generator::is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_sub_type, RHS_TYPE > >::type
-        operator-= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_sub_type,RHS_TYPE >();
-        }
-
-        operator compound_node<self_type,assign_type,self_type>() 
-        {
-          return compound_node<self_type,assign_type,self_type>();
-        }
-    };
-
-    template<unsigned int ID,typename SCALARTYPE, class F, unsigned int ALIGNMENT>
-    class tmp_symbolic_matrix<symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> > {};
-    
-  } // namespace generator
-} // namespace viennacl
-
-#endif
-
-
diff --git a/viennacl/generator/symbolic_types/symbolic_scalars.hpp b/viennacl/generator/symbolic_types/symbolic_scalars.hpp
deleted file mode 100644
index 0776da1..0000000
--- a/viennacl/generator/symbolic_types/symbolic_scalars.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-#ifndef VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_SCALARS_HPP
-#define VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_SCALARS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/symbolic_types/symbolic_scalars.hpp
- *  @brief Implementation of the symbolic scalar types.
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    ///////////////////////////////////////
-    /////// REGULAR SYM SCALARS //////////
-    //////////////////////////////////////
-
-    /**
-    * @brief Symbolic scalar type. Will be passed by value.
-    * 
-    * @tparam ID The argument ID of the scalar in the generated code
-    * @tparam SCALARTYPE The Scalartype of the scalar in the generated code
-    */
-    template <unsigned int ID, typename SCALARTYPE>
-    class cpu_symbolic_scalar
-    {
-      private:
-        typedef cpu_symbolic_scalar<ID,SCALARTYPE> self_type;
-
-      public:
-
-        typedef SCALARTYPE ScalarType;
-
-        typedef ScalarType runtime_type;
-        
-        enum { id = ID };
-
-        static const std::string name() 
-        {
-          std::ostringstream oss;
-          oss << "c_s" << ID ;
-          return oss.str();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-          return print_type<SCALARTYPE,1>::value() + " " + name() + "\n";
-        }
-    };
-
-    /**
-     * @brief Symbolic scalar type. Will be passed by pointer.
-     * 
-     * @tparam ID The argument ID of the scalar in the generated code
-     * @tparam SCALARTYPE The Scalartype of the scalar in the generated code
-     */
-    template <unsigned int ID, typename SCALARTYPE>
-    class gpu_symbolic_scalar 
-    {
-      private:
-        typedef gpu_symbolic_scalar<ID,SCALARTYPE> self_type;
-
-      public:
-
-        typedef SCALARTYPE ScalarType;
-
-        typedef viennacl::scalar<ScalarType> runtime_type;
-        
-        enum { id = ID };
-
-        static const std::string name() 
-        {
-          std::ostringstream oss;
-          oss << "g_s" << ID ;
-          return oss.str();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-          return "__global " + print_type<SCALARTYPE*,1>::value() + " " + name() + "\n" ;
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, assign_type, RHS_TYPE > >::type
-        operator= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,assign_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_mul_type, RHS_TYPE > >::type
-        operator*= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_mul_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_div_type, RHS_TYPE > >::type
-        operator/= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_div_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_add_type, RHS_TYPE > >::type
-        operator+= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_add_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_sub_type, RHS_TYPE > >::type
-        operator-= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_sub_type,RHS_TYPE >();
-        }
-
-        operator compound_node<self_type,assign_type,self_type>() 
-        {
-          return compound_node<self_type,assign_type,self_type>();
-        }
-    };
-
-    ///////////////////////////////////////
-    ///////// SCALAR MULTIPLICATION ///////
-    //////////////////////////////////////
-
-    /** @brief Scalar multiplication operator */
-    template<class LHS_TYPE, class RHS_TYPE>
-    typename enable_if_c<is_scalar_expression<LHS_TYPE>::value || is_scalar_expression<RHS_TYPE>::value,
-                         compound_node<LHS_TYPE,scal_mul_type,RHS_TYPE> >::type
-    operator* ( LHS_TYPE const & lhs, RHS_TYPE const & rhs ) 
-    {
-      return compound_node<LHS_TYPE, scal_mul_type,RHS_TYPE> ();
-    }
-
-    /** @brief Scalar division operator */
-    template<class LHS_TYPE, class RHS_TYPE>
-    typename enable_if_c< is_scalar_expression<RHS_TYPE>::value,
-                          compound_node<LHS_TYPE,scal_div_type,RHS_TYPE> > ::type
-    operator/ ( LHS_TYPE const & lhs, RHS_TYPE const & rhs ) 
-    {
-      return compound_node<LHS_TYPE,scal_div_type,RHS_TYPE> ();
-    }
-
-  }
-}
-#endif
diff --git a/viennacl/generator/symbolic_types/symbolic_vector.hpp b/viennacl/generator/symbolic_types/symbolic_vector.hpp
deleted file mode 100644
index b4c1a16..0000000
--- a/viennacl/generator/symbolic_types/symbolic_vector.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_VECTOR_HPP
-#define VIENNACL_GENERATOR_SYMBOLIC_TYPES_SYMBOLIC_VECTOR_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/symbolic_types/symbolic_vector.hpp
- *  @brief Implementation of a symbolic vector type
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/vector.hpp"
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/traits/result_of.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    /**
-     * @brief Symbolic vector type
-     * 
-     * @tparam ID The argument ID of the vector in the generated code
-     * @tparam SCALARTYPE The Scalartype of the vector in the generated code
-     * @tparam ALIGNMENT The Alignment of the vector in the generated code
-     */
-    template <unsigned int ID, typename SCALARTYPE, unsigned int ALIGNMENT>
-    class symbolic_vector 
-    {
-      private:
-        typedef symbolic_vector<ID,SCALARTYPE,ALIGNMENT> self_type;
-
-      public:
-	
-        typedef SCALARTYPE ScalarType;
-        
-        static const unsigned int Alignment = ALIGNMENT;
-
-        typedef viennacl::vector<ScalarType,Alignment> runtime_type;
-
-        static const unsigned int id = ID;
-
-        static const std::string name() 
-        {
-          return "v_a" + to_string(Alignment) + "_" + to_string(ID);
-        }
-
-        static const std::string size2_name() 
-        {
-          return "size_"+name();
-        }
-
-        static const std::string internal_size2_name() 
-        {
-          return "internal_size_"+name();
-        }
-        
-        static const std::string name_argument() 
-        {
-          return " __global " + print_type<SCALARTYPE*,Alignment>::value() + " " + name();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-          return " __global " + print_type<SCALARTYPE*,Alignment>::value() + " " + name() 
-               + ", unsigned int " + size2_name() 
-               + ", unsigned int " + internal_size2_name() + "\n" ;
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, assign_type, RHS_TYPE > >::type
-        operator= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,assign_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_mul_type, RHS_TYPE > >::type
-        operator*= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_mul_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_scalar_expression<RHS_TYPE>,
-                           compound_node<self_type, inplace_scal_div_type, RHS_TYPE > >::type
-        operator/= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_scal_div_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_add_type, RHS_TYPE > >::type
-        operator+= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_add_type,RHS_TYPE >();
-        }
-
-        template<typename RHS_TYPE>
-        typename enable_if<is_same_expression_type<self_type,RHS_TYPE>,
-                           compound_node<self_type, inplace_sub_type, RHS_TYPE > >::type
-        operator-= ( RHS_TYPE const & rhs ) const 
-        {
-          return compound_node<self_type,inplace_sub_type,RHS_TYPE >();
-        }
-
-        operator compound_node<self_type,assign_type,self_type>() 
-        {
-          return compound_node<self_type,assign_type,self_type>();
-        }
-    };
-
-    template< unsigned int ID,class SCALARTYPE,unsigned int ALIGNMENT>
-    class tmp_symbolic_vector<symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > 
-    {
-        typedef symbolic_vector<ID,SCALARTYPE,ALIGNMENT> ARG;
-
-      public:
-        typedef SCALARTYPE ScalarType;
-
-        typedef typename symbolic_vector<ID,SCALARTYPE,ALIGNMENT>::runtime_type runtime_type;
-        
-        static const unsigned int Alignment = ALIGNMENT;
-
-        static const unsigned int id = ID;
-
-
-        static const std::string name() 
-        {
-          return "tmp_" + ARG::name();
-        }
-
-        static const std::string size2_name() 
-        {
-          return "size_"+name();
-        }
-
-        static const std::string internal_size2_name() 
-        {
-          return "internal_size_"+name();
-        }
-        
-        static const std::string name_argument() 
-        {
-          return " __global " + print_type<SCALARTYPE*,Alignment>::value() + " " + name();
-        }
-
-        static const std::string kernel_arguments() 
-        {
-            return " __global " + print_type<SCALARTYPE*,Alignment>::value() + " " + name() 
-                 + ", unsigned int " + size2_name() 
-                 + ", unsigned int " + internal_size2_name() + "\n" ;
-        }
-    };
-  }
-}
-
-#endif
-
diff --git a/viennacl/generator/tokens_management.hpp b/viennacl/generator/tokens_management.hpp
deleted file mode 100644
index 5a45597..0000000
--- a/viennacl/generator/tokens_management.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef VIENNACL_GENERATOR_TOKENS_MANAGEMENT_HPP
-#define VIENNACL_GENERATOR_TOKENS_MANAGEMENT_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/tokens_management.hpp
- *  @brief Creation and management of the tokens list
- * 
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/generator/compound_node.hpp"
-#include "viennacl/generator/operation_types.hpp"
-#include "viennacl/generator/tree_operations.hpp"
-#include "viennacl/generator/meta_tools/typelist.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    /////////////////////////////
-    ///////// TOKENS ///////////
-    ////////////////////////////
-
-    template<class T, bool is_in_temporary_kernel, class TList = NullType, class TokenOp = add_type,  class Enable = void>
-    struct extract_tokens 
-    {
-      typedef TList Result;
-    };
-
-    template <class LHS, class OP, class RHS, bool is_temporary, bool is_in_temporary_kernel, class TList, class TokenOp>
-    struct extract_tokens<compound_node<LHS, OP, RHS, is_temporary>,
-                          is_in_temporary_kernel,
-                          TList,
-                          TokenOp> 
-    {
-      private:
-        typedef compound_node<LHS,OP,RHS,is_temporary> T;
-        typedef typename extract_tokens<LHS, is_in_temporary_kernel, TList, TokenOp>::Result LHS_Result;
-        typedef typename extract_tokens<RHS, is_in_temporary_kernel, TList, OP>::Result      RHS_Result;
-        typedef typename typelist_utils::fuse<RHS_Result,LHS_Result>::Result                 ResultFalse;
-        typedef typename typelist_utils::append<TList, std::pair<T,TokenOp> >::Result        ResulTrue;
-        
-      public:
-        typedef typename get_type_if<ResulTrue,ResultFalse,is_product_leaf<T>::value>::Result Result;
-    };
-
-    template <class TList,bool make_operator_inplace>
-    struct tokenize_operators 
-    {
-      private:
-        typedef typename TList::Head Head;
-        typedef typename get_type_if<typename make_inplace<typename Head::second_type>::Result,
-                                     assign_type,
-                                     make_operator_inplace>::Result   NewOperator;
-        typedef std::pair<typename Head::first_type, NewOperator>     NewHead;
-        typedef typename TList::Tail                                  Tail;
-        typedef typename tokenize_operators<Tail, true>::Result       NewTail;
-        
-      public:
-        typedef typelist<NewHead, NewTail> Result;
-    };
-
-    template <bool make_operator_inplace>
-    struct tokenize_operators<NullType,make_operator_inplace> 
-    {
-      typedef NullType Result;
-    };
-
-
-
-    template <class T, bool is_in_temporary_kernel>
-    struct generate_tokens;
-
-    template <class LHS,class OP, class RHS, bool is_temporary, bool is_in_temporary_kernel>
-    struct generate_tokens<compound_node<LHS,OP,RHS,is_temporary>, is_in_temporary_kernel> 
-    {
-      private:
-        typedef typename tree_utils::remove_if<RHS, is_product_leaf>::Result                NewTree;
-        typedef std::pair<NewTree,OP>                                                       LinearToken;
-        typedef typename extract_tokens<RHS, is_in_temporary_kernel>::Result                Products;
-        typedef typename tokenize_operators<Products,
-                                            !is_null_type<NewTree>::value>::Result         TokenizedProducts;
-
-      public:
-        typedef typelist<LinearToken,TokenizedProducts> Result;
-    };
-
-  }
-}
-#endif
diff --git a/viennacl/generator/traits/general_purpose_traits.hpp b/viennacl/generator/traits/general_purpose_traits.hpp
deleted file mode 100644
index d3387a2..0000000
--- a/viennacl/generator/traits/general_purpose_traits.hpp
+++ /dev/null
@@ -1,250 +0,0 @@
-#ifndef VIENNACL_GENERATOR_TRAITS_GENERAL_PURPOSE_TRAITS_HPP
-#define VIENNACL_GENERATOR_TRAITS_GENERAL_PURPOSE_TRAITS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/traits/general_purpose_traits.hpp
- *  @brief Provides a set of metafunctions for the identification of types
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include "viennacl/generator/operation_types.hpp"
-#include "viennacl/generator/forwards.h"
-#include "viennacl/generator/traits/result_of.hpp"
-#include "viennacl/generator/meta_tools/typelist.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-    template <class T>
-    struct is_scalar_expression_impl 
-    {
-      enum { value = 0 };
-    };
-
-    template <class T>
-    struct is_scalar_expression_impl<result_of::scalar_expression<T> > 
-    {
-      enum { value = 1};
-    };
-
-    template <class T>
-    struct is_scalar_expression 
-    {
-      enum { value = is_scalar_expression_impl<typename result_of::expression_type<T>::Result >::value };
-    };
-
-    template <class T>
-    struct is_temporary 
-    {
-      enum { value = 0 } ;
-    };
-
-    template <class LHS, class OP, class RHS>
-    struct is_temporary<compound_node<LHS,OP,RHS,true> > 
-    {
-      enum {value = 1};
-    };
-
-    template <class REF>
-    struct is_temporary<tmp_symbolic_vector<REF> > 
-    {
-      enum { value = 1};
-    };
-
-    template <class T>
-    struct is_temporary_kernel_parameter 
-    {
-      enum { value = is_temporary<T>::value };
-    };
-
-    template <class T>
-    struct is_temporary_kernel_parameter<inner_prod_impl_t<T> > 
-    {
-      enum { value = 1 };
-    };
-
-
-    template <class T>
-    struct is_regular_kernel_parameter 
-    {
-      enum { value = 0 };
-    };
-
-    template <unsigned int ID,class SCALARTYPE, unsigned int ALIGNMENT>
-    struct is_regular_kernel_parameter<symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > 
-    {
-      enum { value = 1 };
-    };
-
-    template <unsigned int ID,class SCALARTYPE, class F, unsigned int ALIGNMENT>
-    struct is_regular_kernel_parameter<symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> > 
-    {
-      enum { value = 1 };
-    };
-
-    template <unsigned int ID, class SCALARTYPE>
-    struct is_regular_kernel_parameter<cpu_symbolic_scalar<ID, SCALARTYPE> > 
-    {
-      enum { value = 1 };
-    };
-
-    template <unsigned int ID, class SCALARTYPE>
-    struct is_regular_kernel_parameter<gpu_symbolic_scalar<ID, SCALARTYPE> > 
-    {
-      enum { value = 1 };
-    };
-
-
-
-    template <class T>
-    struct is_pure_inner_product_leaf 
-    {
-      enum { value = 0};
-    };
-
-    template <class LHS,class RHS, bool is_temporary>
-    struct is_pure_inner_product_leaf<compound_node<LHS,inner_prod_type,RHS, is_temporary> > 
-    {
-      enum { value = 1};
-    };
-
-    template <class T>
-    struct is_inner_product_leaf 
-    {
-      enum { value = is_pure_inner_product_leaf<T>::value };
-    };
-
-    template <class LHS,class RHS>
-    struct is_inner_product_leaf<compound_node<LHS,scal_mul_type,RHS> > 
-    {
-      enum { value = ( is_inner_product_leaf<LHS>::value  && is_scalar_expression<RHS>::value && !is_inner_product_leaf<RHS>::value )
-                     || ( is_inner_product_leaf<RHS>::value && is_scalar_expression<LHS>::value &&!is_inner_product_leaf<LHS>::value )
-           };
-    };
-
-    template <class LHS,class RHS>
-    struct is_inner_product_leaf<compound_node<LHS,scal_div_type,RHS> >  
-    {
-      enum { value = ( is_inner_product_leaf<LHS>::value  && is_scalar_expression<RHS>::value && !is_inner_product_leaf<RHS>::value )
-                     || ( is_inner_product_leaf<RHS>::value && is_scalar_expression<LHS>::value &&!is_inner_product_leaf<LHS>::value )
-           };
-    };
-
-    template <class T>
-    struct is_pure_product_leaf 
-    {
-      enum { value = 0};
-    };
-
-    template <class LHS,class RHS, bool is_temporary>
-    struct is_pure_product_leaf<compound_node<LHS,prod_type,RHS, is_temporary> > 
-    {
-      enum { value = 1};
-    };
-
-    template <class T>
-    struct is_product_leaf 
-    {
-      enum { value = is_pure_product_leaf<T>::value };
-    };
-
-    template <class LHS,class RHS>
-    struct is_product_leaf<compound_node<LHS,scal_mul_type,RHS> > 
-    {
-      enum { value = is_product_leaf<LHS>::value
-                     ||is_product_leaf<RHS>::value
-           };
-    };
-
-    template <class T>
-    struct is_null_type 
-    {
-      enum { value = 0 };
-    };
-
-    template <>
-    struct is_null_type<NullType> 
-    {
-      enum { value = 1 };
-    };
-
-    template <class T>
-    struct is_compound 
-    {
-      enum { value = 0 } ;
-    };
-
-    template <class LHS, class OP, class RHS, bool is_temporary>
-    struct is_compound<compound_node<LHS,OP,RHS,is_temporary> > 
-    {
-      enum {value = 1};
-    };
-
-    template <class EXPR1, class EXPR2>
-    struct is_same_expression_type_impl 
-    {
-      enum { value = 0 };
-    };
-
-    template <class EXPR1, class DESCRIPTOR1, class EXPR2, class DESCRIPTOR2>
-    struct is_same_expression_type_impl<result_of::vector_expression<EXPR1,DESCRIPTOR1>,
-                                        result_of::vector_expression<EXPR2,DESCRIPTOR2> > 
-    {
-      private:
-        typedef result_of::vector_expression<EXPR1,DESCRIPTOR1> LHS;
-        typedef result_of::vector_expression<EXPR2,DESCRIPTOR2> RHS;
-      public:
-        enum { value = LHS::Alignment == RHS::Alignment };
-    };
-
-    template <class EXPR1, class LHS_DESCRIPTOR1, class RHS_DESCRIPTOR1,
-              class EXPR2, class LHS_DESCRIPTOR2, class RHS_DESCRIPTOR2>
-    struct is_same_expression_type_impl<result_of::matrix_expression<EXPR1,LHS_DESCRIPTOR1,RHS_DESCRIPTOR1>,
-                                        result_of::matrix_expression<EXPR2,LHS_DESCRIPTOR2,RHS_DESCRIPTOR2> > 
-    {
-      private:
-        typedef result_of::matrix_expression<EXPR1,LHS_DESCRIPTOR1,RHS_DESCRIPTOR1> LHS;
-        typedef result_of::matrix_expression<EXPR2,LHS_DESCRIPTOR2,RHS_DESCRIPTOR2> RHS;
-        
-      public:
-        enum { value = LHS::Alignment == RHS::Alignment };
-    };
-
-    template <class EXPR1, class EXPR2>
-    struct is_same_expression_type_impl<result_of::scalar_expression<EXPR1>,
-                                        result_of::scalar_expression<EXPR2> > 
-    {
-      enum { value = 1 };
-    };
-
-    template<class EXPR1, class EXPR2>
-    struct is_same_expression_type 
-    {
-      enum { value = is_same_expression_type_impl<typename result_of::expression_type<EXPR1>::Result,
-                                                  typename result_of::expression_type<EXPR2>::Result>::value
-           };
-    };
-
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/traits/result_of.hpp b/viennacl/generator/traits/result_of.hpp
deleted file mode 100644
index a4a7d59..0000000
--- a/viennacl/generator/traits/result_of.hpp
+++ /dev/null
@@ -1,591 +0,0 @@
-#ifndef VIENNACL_GENERATOR_TRAITS_RESULT_OF_HPP
-#define VIENNACL_GENERATOR_TRAITS_RESULT_OF_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/traits/result_of.hpp
- *  @brief Provides a set of metafunctions for type deductions within the kernel generator framework.
- *
- *  Generator code contributed by Philippe Tillet
- */
-
-#include <string>
-
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-#include "viennacl/generator/forwards.h"
-#include "viennacl/generator/meta_tools/utils.hpp"
-#include "viennacl/generator/elementwise_modifier.hpp"
-#include "viennacl/ocl/local_mem.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/forwards.h"
-#include "CL/cl.h"
-
-namespace viennacl
-{
-  namespace generator
-  {
-    namespace result_of 
-    {
-
-      class runtime_wrapper
-      {
-        protected:
-	        bool is_temporary_;
-	        std::string name_;
-	        int arg_id_;
-	        
-        public:
-	        runtime_wrapper(bool _is_temporary, std::string const & _name, int _arg_id) 
-	         : is_temporary_(_is_temporary), name_(_name), arg_id_(_arg_id) {}
-	         
-	        bool is_temporary() const { return is_temporary_; }
-	        int arg_id() const { return arg_id_; }
-	        std::string name() const { return name_; }
-	        
-	        virtual void enqueue(unsigned int arg_pos, 
-	                             viennacl::ocl::kernel & k,
-	                             std::map<unsigned int, viennacl::any> & runtime_args,
-	                             std::map<std::string, viennacl::ocl::handle<cl_mem> > & temporaries) = 0;
-      };
-
-      class shared_memory_wrapper : public runtime_wrapper
-      {
-        public:
-	        shared_memory_wrapper() : runtime_wrapper(true, "shared_memory_ptr", -1 ){ }
-	
-	        void enqueue(unsigned int arg_pos,
-	                     viennacl::ocl::kernel & k,
-	                     std::map<unsigned int, viennacl::any> & runtime_args,
-	                     std::map<std::string, viennacl::ocl::handle<cl_mem> > & temporaries)
-	        {
-		        unsigned int lmem_size = k.local_work_size();
-		        k.arg(arg_pos, viennacl::ocl::local_mem(lmem_size*sizeof(float)));
-	        }
-	
-      };
-
-      template <class T, class SIZE_T>
-      struct vector_runtime_wrapper : public runtime_wrapper 
-      {
-        private:
-          unsigned int size_id_;
-          
-          template<typename ScalarType, unsigned int Alignment>
-          typename SIZE_T::size_type size(viennacl::vector<ScalarType,Alignment> * size_arg) { return size_arg->size(); }
-
-          template<typename ScalarType, class F, unsigned int Alignment>
-          typename SIZE_T::size_type size(viennacl::matrix<ScalarType,F,Alignment> * size_arg) { return size_arg->size2(); }
-          
-          template<typename ScalarType, unsigned int Alignment>
-          typename SIZE_T::size_type internal_size(viennacl::vector<ScalarType,Alignment> * size_arg) { return size_arg->internal_size(); }
-
-          template<typename ScalarType, class F, unsigned int Alignment>
-          typename SIZE_T::size_type internal_size(viennacl::matrix<ScalarType,F,Alignment> * size_arg) { return size_arg->internal_size2(); }
-          
-    public:
-          vector_runtime_wrapper(bool _is_temporary, std::string const & _name, int _arg_id, unsigned int _size_id) 
-            : runtime_wrapper(_is_temporary,_name,_arg_id),size_id_(_size_id) {}
-            
-          void enqueue(unsigned int arg_pos,
-                       viennacl::ocl::kernel & k,
-                       std::map<unsigned int, viennacl::any> & runtime_args,
-                       std::map<std::string, 
-                       viennacl::ocl::handle<cl_mem> > & temporaries)
-          { 
-            SIZE_T * size_arg = viennacl::any_cast<SIZE_T * >(runtime_args[size_id_]);
-            viennacl::ocl::handle<cl_mem> handle = NULL;
-            if(is_temporary_)
-            {
-	            if(temporaries.find(name_)==temporaries.end())
-	            {
-		            temporaries.insert(
-		             std::make_pair(name_,
-		                            viennacl::ocl::handle<cl_mem>(
-		                              viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-		                                                                             size_arg->internal_size()*sizeof(typename T::value_type))
-		                                                                            )
-		                           )
-		                              );
-	            }
-	            handle = temporaries[name_];
-            }
-            else
-            {
-	            T * current_arg = viennacl::any_cast<T * >(runtime_args[arg_id_]);
-	            handle = current_arg->handle();
-            }
-            k.arg(arg_pos, handle );
-            k.arg(arg_pos+1,cl_uint(size(size_arg)));
-            k.arg(arg_pos+2,cl_uint(internal_size(size_arg)));
-          }
-      };
-
-      template <class T, class SIZE_DESCRIPTOR>
-      struct vector_expression : public runtime_wrapper
-      {
-        typedef T type;
-        typedef typename SIZE_DESCRIPTOR::ScalarType ScalarType;
-        static const unsigned int Alignment = SIZE_DESCRIPTOR::Alignment;
-
-        static runtime_wrapper * runtime_descriptor()
-        {
-          return new vector_runtime_wrapper<viennacl::vector<ScalarType,Alignment>,
-                                            typename SIZE_DESCRIPTOR::runtime_type>(viennacl::generator::is_temporary<T>::value,
-                                                                                    T::name(),
-                                                                                    T::id,SIZE_DESCRIPTOR::id);
-        }
-        
-        static const std::string size_expression() 
-        {
-          return SIZE_DESCRIPTOR::size2_name();
-        }
-        
-        static const std::string internal_size_expression() 
-        {
-          return SIZE_DESCRIPTOR::internal_size2_name() + "/" + to_string(Alignment);
-        }
-      };
-
-      template <class T, class SIZE1_T, class SIZE2_T>
-      struct matrix_runtime_wrapper : public runtime_wrapper
-      {
-        private:
-	        unsigned int size1_id_;
-	        unsigned int size2_id_;
-        public:
-          matrix_runtime_wrapper(bool _is_temporary, 
-                                 std::string const & _name,
-                                 int _arg_id,
-                                 unsigned int _size1_id,
-                                 unsigned int _size2_id) 
-                                : runtime_wrapper(_is_temporary,_name,_arg_id), size1_id_(_size1_id), size2_id_(_size2_id) {}
-                                
-          unsigned int n_elements(){ return size1_id_*size2_id_; }
-          
-          void enqueue(unsigned int arg_pos,
-                       viennacl::ocl::kernel & k,
-                       std::map<unsigned int, viennacl::any> & runtime_args,
-                       std::map<std::string,
-                       viennacl::ocl::handle<cl_mem> > & temporaries)
-          { 
-	          if (is_temporary_) {}
-	          
-	          T * current_arg = any_cast<T * >(runtime_args[arg_id_]);
-	          SIZE1_T * size1_arg = any_cast<SIZE1_T * >(runtime_args[size1_id_]);
-	          SIZE2_T * size2_arg = any_cast<SIZE2_T * >(runtime_args[size2_id_]);
-	          k.arg(arg_pos, current_arg->handle());
-	          k.arg(arg_pos+1,cl_uint(size1_arg->size1()));
-	          k.arg(arg_pos+2,cl_uint(size2_arg->size2()));
-	          k.arg(arg_pos+3,cl_uint(size1_arg->internal_size1()));
-	          k.arg(arg_pos+4,cl_uint(size2_arg->internal_size2()));
-          }
-      };
-          
-      template <class T, class SIZE1_DESCRIPTOR, class SIZE2_DESCRIPTOR>
-      struct matrix_expression 
-      {
-        typedef typename SIZE1_DESCRIPTOR::ScalarType ScalarType;
-        typedef typename SIZE1_DESCRIPTOR::Layout Layout;
-        static const unsigned int Alignment = SIZE1_DESCRIPTOR::Alignment;
-        
-        static runtime_wrapper * runtime_descriptor()
-        {
-          return new matrix_runtime_wrapper<viennacl::matrix<ScalarType,Layout,Alignment>,
-                                            typename SIZE1_DESCRIPTOR::runtime_type,
-                                            typename SIZE2_DESCRIPTOR::runtime_type>(is_temporary<T>::value,T::name(),
-                                                                                     T::id,SIZE1_DESCRIPTOR::id,
-                                                                                     SIZE2_DESCRIPTOR::id);
-        }
-        
-        static const std::string size1_expression() 
-        {
-          return SIZE1_DESCRIPTOR::size1_name();
-        }
-
-        static const std::string size2_expression() 
-        {
-          return SIZE2_DESCRIPTOR::size2_name();
-        }
-
-        static const std::string internal_size1_expression() 
-        {
-          return SIZE1_DESCRIPTOR::internal_size1_name() + "/" + to_string(Alignment);
-        }
-
-        static const std::string internal_size2_expression() 
-        {
-          return SIZE2_DESCRIPTOR::internal_size2_name() + "/" + to_string(Alignment);
-        }
-
-        typedef T type;
-      };
-
-      template <class T>
-      struct scalar_size_descriptor
-      {
-	      static unsigned int size(viennacl::ocl::kernel & k) { return 1; }
-      };
-
-      template <class LHS, class RHS, bool is_temporary>
-      struct scalar_size_descriptor<compound_node<LHS,inner_prod_type,RHS,is_temporary> >
-      {
-	      static unsigned int size(viennacl::ocl::kernel & k)
-	      {
-		      return k.global_work_size(0)/k.local_work_size(0);
-	      }
-      };
-
-      template <class T>
-      struct scalar_runtime_wrapper: public runtime_wrapper
-      {
-        typedef typename T::ScalarType ScalarType;
-        
-        scalar_runtime_wrapper(bool _is_temporary, std::string const & _name, int _arg_id) : runtime_wrapper(_is_temporary,_name,_arg_id){}
-        
-        void enqueue(unsigned int arg_pos,
-                     viennacl::ocl::kernel & k,
-                     std::map<unsigned int,
-                     viennacl::any> & runtime_args, 
-                     std::map<std::string, 
-                     viennacl::ocl::handle<cl_mem> > & temporaries)
-        {
-		      if(is_temporary_)
-		      {
-			      if(temporaries.find(name_)==temporaries.end()) 
-			      {
-				      temporaries.insert(
-  				         std::make_pair(name_,
-                                  viennacl::ocl::handle<cl_mem>(
-                                  viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                 scalar_size_descriptor<T>::size(k)*sizeof(ScalarType))
-                                                               )
-                                 )
-                               );
-			      }
-			      k.arg(arg_pos, temporaries[name_]);
-		      }
-		      
-		      if(arg_id_==-2)
-			      k.arg(arg_pos, temporaries[name_]);
-		      else
-		      {
-			      viennacl::scalar<ScalarType>* current_arg = any_cast<viennacl::scalar<ScalarType> * >(runtime_args[arg_id_]);
-			      k.arg(arg_pos, current_arg->handle());
-		      }
-		
-        }
-      };
-
-      template <unsigned int ID, class ScalarType>
-      struct scalar_runtime_wrapper<viennacl::generator::cpu_symbolic_scalar<ID, ScalarType> >: public runtime_wrapper
-      {
-        scalar_runtime_wrapper(bool _is_temporary, std::string const & _name, int _arg_id) : runtime_wrapper(_is_temporary,_name,_arg_id){ }
-        
-        void enqueue(unsigned int arg_pos,
-                     viennacl::ocl::kernel & k,
-                     std::map<unsigned int, viennacl::any> & runtime_args,
-                     std::map<std::string, viennacl::ocl::handle<cl_mem> > & temporaries)
-        {
-          ScalarType* current_arg = any_cast<ScalarType * >(runtime_args[arg_id_]);
-          k.arg(arg_pos, cl_float(*current_arg));
-        }
-      };
-          
-      template <class T>
-      struct scalar_expression 
-      {
-        typedef typename T::ScalarType ScalarType;
-        
-        static runtime_wrapper * runtime_descriptor()
-        {
-          return new scalar_runtime_wrapper<T>(is_temporary<T>::value,T::name(),T::id);
-        }
-      };
-
-      /*
-       * Compound Nodes - General case
-       */
-      template <class T>
-      struct expression_type 
-      {
-        typedef NullType Result;
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary>
-      struct expression_type<compound_node<LHS,OP,RHS,is_temporary> > 
-      {
-        private:
-          typedef typename expression_type<LHS>::Result LHS_Result;
-          typedef typename expression_type<RHS>::Result RHS_Result;
-          
-        public:
-          typedef typename expression_type<compound_node<LHS_Result, OP, RHS_Result,is_temporary> >::Result Result;
-      };
-
-      /*
-       * Compound Nodes - usual operators
-       */
-      template <class LHS, class LHS_SIZE_DESCRIPTOR ,class OP ,class RHS, class RHS_SIZE_DESCRIPTOR ,bool is_temporary>
-      struct expression_type<compound_node<vector_expression<LHS,LHS_SIZE_DESCRIPTOR>,
-                                           OP,
-                                           vector_expression<RHS,RHS_SIZE_DESCRIPTOR>,
-                                           is_temporary>
-                            >
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-          
-        public:
-          typedef vector_expression<T, LHS_SIZE_DESCRIPTOR> Result;
-      };
-
-
-      template <class LHS, class LHS_SIZE1_DESCRIPTOR, class LHS_SIZE2_DESCRIPTOR,
-                class OP,
-                class RHS, class RHS_SIZE1_DESCRIPTOR, class RHS_SIZE2_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type<compound_node<matrix_expression<LHS, LHS_SIZE1_DESCRIPTOR, LHS_SIZE2_DESCRIPTOR>,
-                                           OP,
-                                           matrix_expression<RHS, RHS_SIZE1_DESCRIPTOR, RHS_SIZE2_DESCRIPTOR>,
-                                           is_temporary> 
-                             > 
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-          
-        public:
-          typedef matrix_expression<T, LHS_SIZE1_DESCRIPTOR, LHS_SIZE2_DESCRIPTOR> Result;
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary>
-      struct expression_type<compound_node<scalar_expression<LHS>, 
-                                           OP,
-                                           scalar_expression<RHS>,
-                                           is_temporary> 
-                            > 
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-          
-        public:
-          typedef scalar_expression<T> Result;
-      };
-
-      /*
-       * Scalar Operators
-       */
-      template <class LHS, class LHS_SIZE_DESCRIPTOR,
-                class OP,
-                class RHS,
-                bool is_temporary>
-      struct  expression_type<compound_node<vector_expression<LHS,LHS_SIZE_DESCRIPTOR>,
-                                            OP,
-                                            scalar_expression<RHS>,
-                                            is_temporary> > 
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-        public:
-          typedef vector_expression<T, LHS_SIZE_DESCRIPTOR> Result;
-      };
-
-      template <class LHS,
-                class OP,
-                class RHS, class RHS_SIZE_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type<compound_node<scalar_expression<LHS>,
-                                           OP,
-                                           vector_expression<RHS,RHS_SIZE_DESCRIPTOR>,
-                                           is_temporary>
-                            > 
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-        public:
-          typedef vector_expression<T, RHS_SIZE_DESCRIPTOR> Result;
-      };
-
-
-      template <class LHS, class LHS_SIZE1_DESCRIPTOR, class LHS_SIZE2_DESCRIPTOR,
-                class OP,
-                class RHS, bool is_temporary>
-      struct expression_type<compound_node<matrix_expression<LHS,LHS_SIZE1_DESCRIPTOR,LHS_SIZE2_DESCRIPTOR>,
-                                           OP,
-                                           scalar_expression<RHS>,
-                                           is_temporary>
-                            > 
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-        public:
-          typedef matrix_expression<T, LHS_SIZE1_DESCRIPTOR, LHS_SIZE2_DESCRIPTOR> Result;
-      };
-
-      template <class LHS, 
-                class OP,
-                class RHS, class RHS_SIZE1_DESCRIPTOR, class RHS_SIZE2_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type<compound_node<scalar_expression<LHS>,
-                                           OP,
-                                           matrix_expression<RHS,RHS_SIZE1_DESCRIPTOR, RHS_SIZE2_DESCRIPTOR>,
-                                           is_temporary>
-                            >
-      {
-        private:
-          typedef compound_node<LHS ,OP, RHS, is_temporary> T;
-        public:
-          typedef matrix_expression<T, RHS_SIZE1_DESCRIPTOR, RHS_SIZE2_DESCRIPTOR> Result;
-      };
-
-
-      /*
-       * Compound Nodes - Non Trivial Operators
-       */
-
-      //Matrix-Vector product
-      template <class LHS, class LHS_SIZE1_DESCRIPTOR, class LHS_SIZE2_DESCRIPTOR,
-                class RHS, class RHS_SIZE_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type<compound_node<matrix_expression<LHS, LHS_SIZE1_DESCRIPTOR, LHS_SIZE2_DESCRIPTOR>,
-                                           prod_type,
-                                           vector_expression<RHS,RHS_SIZE_DESCRIPTOR>,
-                                           is_temporary> 
-                            >
-      {
-        typedef vector_expression<compound_node<LHS,prod_type,RHS,is_temporary>, LHS_SIZE1_DESCRIPTOR > Result;
-      };
-
-      template <class T>
-      struct expression_type<inner_prod_impl_t<T> >
-      {
-	      typedef scalar_expression<T> Result;
-      };
-
-      //Matrix-Matrix product
-      template <class LHS, class LHS_SIZE1_DESCRIPTOR, class LHS_SIZE2_DESCRIPTOR,
-                class RHS, class RHS_SIZE1_DESCRIPTOR, class RHS_SIZE2_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type<compound_node<matrix_expression<LHS, LHS_SIZE1_DESCRIPTOR, LHS_SIZE2_DESCRIPTOR>,
-                                           prod_type,
-                                           matrix_expression<RHS,RHS_SIZE1_DESCRIPTOR,RHS_SIZE2_DESCRIPTOR>,
-                                           is_temporary> 
-                            >
-      {
-        typedef matrix_expression<compound_node<LHS,prod_type,RHS,is_temporary>, LHS_SIZE1_DESCRIPTOR, RHS_SIZE2_DESCRIPTOR > Result;
-      };
-
-      //Inner product
-      template <class LHS, class LHS_SIZE_DESCRIPTOR,
-                class RHS, class RHS_SIZE_DESCRIPTOR,
-                bool is_temporary>
-      struct expression_type< compound_node<vector_expression<LHS,LHS_SIZE_DESCRIPTOR>,
-                                            inner_prod_type,
-                                            vector_expression<RHS,RHS_SIZE_DESCRIPTOR>,
-                                            is_temporary>
-                            >
-      {
-        typedef scalar_expression<compound_node<LHS,inner_prod_type,RHS,is_temporary> > Result;
-      };
-
-
-      /*
-       * Elementwise Modifiers
-       */
-      template <class T, std::string (*U)()>
-      struct expression_type< elementwise_modifier_impl<T,U> > 
-      {
-        typedef typename expression_type<T>::Result Result;
-      };
-
-      template <class T, class SIZE_DESCRIPTOR>
-      struct expression_type< vector_expression<T,SIZE_DESCRIPTOR> > 
-      {
-        typedef typename expression_type<T>::Result Result;
-      };
-
-      template <class T, class SIZE1_DESCRIPTOR, class SIZE2_DESCRIPTOR>
-      struct expression_type< matrix_expression<T,SIZE1_DESCRIPTOR,SIZE2_DESCRIPTOR> > 
-      {
-        typedef typename expression_type<T>::Result Result;
-      };
-
-      template <class T>
-      struct expression_type< scalar_expression<T> > 
-      {
-        typedef typename expression_type<T>::Result Result;
-      };
-
-      /*
-       * Symbolic Vectors
-       */
-
-      template <unsigned int ID,typename SCALARTYPE, unsigned int ALIGNMENT>
-      struct expression_type< symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > 
-      {
-        typedef vector_expression<symbolic_vector<ID,SCALARTYPE,ALIGNMENT>,
-                                  symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > Result;
-      };
-
-      template <class Ref>
-      struct  expression_type<tmp_symbolic_vector<Ref> > 
-      {
-        typedef vector_expression<tmp_symbolic_vector<Ref>, Ref> Result;
-      };
-
-      /*
-       * Symbolic Matrices
-       */
-
-      template <unsigned int ID,typename SCALARTYPE, class F, unsigned int ALIGNMENT>
-      struct expression_type<symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> > 
-      {
-        private:
-          typedef symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> T;
-        public:
-          typedef matrix_expression<T, T, T> Result;
-      };
-
-      template <class Ref>
-      struct expression_type<tmp_symbolic_matrix<Ref> > 
-      {
-        typedef matrix_expression<tmp_symbolic_matrix<Ref>, Ref, Ref > Result;
-      };
-
-      /*
-       * Symbolic Scalars
-       */
-
-      template <unsigned int ID, typename SCALARTYPE>
-      struct expression_type<cpu_symbolic_scalar<ID, SCALARTYPE> > 
-      {
-        typedef scalar_expression<cpu_symbolic_scalar<ID, SCALARTYPE> > Result;
-      };
-
-      template <unsigned int ID, typename SCALARTYPE>
-      struct expression_type<gpu_symbolic_scalar<ID, SCALARTYPE> > 
-      {
-        typedef scalar_expression< gpu_symbolic_scalar<ID, SCALARTYPE> > Result;
-      };
-
-
-    }
-  }
-}
-
-#endif
-
-
diff --git a/viennacl/generator/tree_operations.hpp b/viennacl/generator/tree_operations.hpp
deleted file mode 100644
index 53ec317..0000000
--- a/viennacl/generator/tree_operations.hpp
+++ /dev/null
@@ -1,487 +0,0 @@
-#ifndef VIENNACL_GENERATOR_TREE_OPERATIONS_HPP
-#define VIENNACL_GENERATOR_TREE_OPERATIONS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/generator/tree_operations.hpp
- *  @brief Functors for modifying the expression tree.
- * 
- *  Generator code contributed by Philippe Tillet
- */
-
-
-#include "viennacl/generator/elementwise_modifier.hpp"
-#include "viennacl/generator/traits/general_purpose_traits.hpp"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-    namespace tree_utils 
-    {
-
-      /*
-      * Count if
-      */
-
-      template <class T, template<class> class Pred>
-      struct count_if 
-      {
-        enum { value = Pred<T>::value };
-      };
-
-      template <class T, std::string (*U)(), template<class> class Pred>
-      struct count_if<elementwise_modifier_impl<T,U>, Pred>
-      {
-        enum { value = Pred<T>::value + count_if<T, Pred>::value };
-      };
-
-      template<class T, template<class> class Pred>
-      struct count_if<inner_prod_impl_t<T>, Pred> 
-      {
-        enum { value = Pred<inner_prod_impl_t<T> >::value + count_if<T, Pred>::value };
-      };
-
-
-      template<class LHS, class RHS, class OP, bool is_temporary, template<class> class Pred>
-      struct count_if<compound_node<LHS,OP,RHS,is_temporary>,Pred>
-      {
-        private:
-          typedef compound_node<LHS,OP,RHS,is_temporary> T;
-          
-        public:
-          enum { value = Pred<T>::value
-                        +  count_if<LHS, Pred>::value
-                        +  count_if<RHS, Pred>::value
-                };
-      };
-
-
-      /*
-      * Count if type
-      */
-
-      template<class T, class Searched>
-      struct count_if_type 
-      {
-        enum { value = 0 };
-      };
-
-      template<class T>
-      struct count_if_type<T,T> 
-      {
-        enum { value = 1 };
-      };
-
-      template<class T, std::string (*U)(), class Searched>
-      struct count_if_type<elementwise_modifier_impl<T,U>, Searched> 
-      {
-        enum { value = count_if_type<T, Searched>::value };
-      };
-
-      template <class T, std::string (*U)()>
-      struct count_if_type<elementwise_modifier_impl<T,U>, elementwise_modifier_impl<T,U> > 
-      {
-        enum { value = 1 + count_if_type<T, elementwise_modifier_impl<T,U> >::value };
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary>
-      struct count_if_type<compound_node<LHS, OP, RHS, is_temporary>,
-                           compound_node<LHS, OP, RHS, is_temporary> > 
-      {
-        private:
-          typedef compound_node<LHS, OP, RHS, is_temporary> T;
-        public:
-          enum { value = 1 +  count_if_type<LHS, T>::value
-                           +  count_if_type<RHS, T>::value
-               };
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary, class Searched>
-      struct count_if_type< compound_node<LHS,OP,RHS,is_temporary>, Searched> 
-      {
-        enum { value = count_if_type<LHS, Searched>::value
-                      +  count_if_type<RHS, Searched>::value
-             };
-      };
-
-
-      /*
-      * Expand
-      */
-
-      template <class LHS, class OP, bool is_temporary, class RHS_LHS, class RHS_OP, class RHS_RHS, bool RHS_is_temporary>
-      struct expand_right 
-      {
-          typedef compound_node< compound_node<LHS, OP, RHS_LHS, RHS_is_temporary>,
-                                 RHS_OP,
-                                 compound_node<LHS, OP, RHS_RHS, RHS_is_temporary>,
-                                 is_temporary>   Result;
-      };
-
-      template <class LHS_LHS, class LHS_OP, class LHS_RHS, bool LHS_is_temporary, class OP, class RHS, bool is_temporary>
-      struct expand_left 
-      {
-          typedef compound_node< compound_node<LHS_LHS, OP, RHS, LHS_is_temporary>,
-                                 LHS_OP,
-                                 compound_node<LHS_RHS, OP, RHS, LHS_is_temporary>,
-                                 is_temporary>        Result;
-      };
-
-      template <class T>
-      struct expand 
-      {
-        typedef T Result;
-      };
-
-      template <class T, std::string (*U)()>
-      struct expand< elementwise_modifier_impl<T,U> > 
-      {
-        private:
-          typedef typename expand<T>::Result                 SUB_Result;
-        public:
-          typedef elementwise_modifier_impl<SUB_Result,U>    Result;
-      };
-
-      template<class T>
-      struct expand<inner_prod_impl_t<T> > 
-      {
-        private:
-          typedef typename expand<T>::Result      SUB_Result;
-        public:
-          typedef inner_prod_impl_t<SUB_Result>   Result;
-      };
-
-
-      template<class LHS,class OP,class RHS,bool is_temporary>
-      struct expand< compound_node<LHS,OP,RHS,is_temporary> >
-      {
-        typedef compound_node<typename expand<LHS>::Result, OP, typename expand<RHS>::Result, is_temporary>   Result;
-      };
-
-      #define make_right_expandable(__OPERATOR1__ , __OPERATOR2__) \
-                      template<class LHS, class RHS_LHS, class RHS_RHS, bool RHS_is_temporary, bool is_temporary>\
-                      struct expand< compound_node<LHS, __OPERATOR1__, compound_node<RHS_LHS, __OPERATOR2__, RHS_RHS, RHS_is_temporary>, is_temporary> >\
-                      {\
-                        typedef typename expand_right<typename expand<LHS>::Result\
-                                                    , __OPERATOR1__\
-                                                    , is_temporary\
-                                                    , typename expand<RHS_LHS>::Result\
-                                                    , __OPERATOR2__\
-                                                    , typename expand<RHS_RHS>::Result\
-                                                    , RHS_is_temporary>::Result Result;\
-                      }
-
-      #define make_left_expandable(__OPERATOR1__ , __OPERATOR2__) \
-                      template<class LHS_LHS, class LHS_RHS, bool LHS_is_temporary, class RHS, bool is_temporary>\
-                      struct expand< compound_node< compound_node<LHS_LHS, __OPERATOR2__ , LHS_RHS , LHS_is_temporary>\
-                                                    , __OPERATOR1__\
-                                                    , RHS\
-                                                    , is_temporary> >\
-                      {\
-                        typedef typename expand_left< typename expand<LHS_LHS>::Result\
-                                                    , __OPERATOR2__\
-                                                    , typename expand<LHS_RHS>::Result\
-                                                    , LHS_is_temporary\
-                                                    , __OPERATOR1__\
-                                                    , typename expand<RHS>::Result\
-                                                    , is_temporary\
-                                                      >	::Result Result;\
-                      }
-
-      make_right_expandable ( scal_mul_type,add_type );
-      make_right_expandable ( scal_mul_type,sub_type );
-      make_left_expandable ( scal_mul_type,add_type );
-      make_left_expandable ( scal_mul_type,sub_type );
-
-
-      #undef make_left_expandable
-      #undef make_right_expandable
-
-      ////////////////////////////
-      // REGISTER TEMPORARIES  //
-      ///////////////////////////
-
-      template <class T>
-      struct make_temporary;
-
-      template <unsigned int ID, class SCALARTYPE, unsigned int ALIGNMENT>
-      struct make_temporary<symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > 
-      {
-        typedef tmp_symbolic_vector< symbolic_vector<ID,SCALARTYPE,ALIGNMENT> > Result;
-      };
-
-      template <unsigned int ID,typename SCALARTYPE, class F, unsigned int ALIGNMENT>
-      struct make_temporary<symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> > {
-        typedef tmp_symbolic_matrix< symbolic_matrix<ID,SCALARTYPE,F,ALIGNMENT> > Result;
-      };
-
-      template <class T, bool only_first_order, class Assigned = void, bool is_nested = false>
-      struct register_temporaries 
-      {
-        typedef T Result;
-      };
-
-      template <class T, bool only_first_order>
-      struct register_temporaries<T, only_first_order, T, true> 
-      {
-        typedef typename make_temporary<T>::Result     Result;
-      };
-
-      template <class T, std::string (*U)(), bool only_first_order, class Assigned, bool is_nested>
-      struct register_temporaries<elementwise_modifier_impl<T,U>, only_first_order, Assigned, is_nested> 
-      {
-        private:
-          typedef typename register_temporaries<T, only_first_order, Assigned, is_nested>::Result   SUB_Result;
-        public:
-          typedef elementwise_modifier_impl<SUB_Result,U>     Result;
-      };
-
-
-      template <class LHS, class OP, class RHS, bool is_temporary, bool only_first_order, class Assigned, bool is_nested>
-      struct register_temporaries<compound_node<LHS,OP,RHS,is_temporary>, only_first_order, Assigned, is_nested> 
-      {
-        private:
-          typedef compound_node<LHS,OP,RHS,is_temporary> T;
-          static const bool is_non_trivial =  is_pure_product_leaf<T>::value ||is_pure_inner_product_leaf<T>::value;
-          typedef typename register_temporaries<LHS, only_first_order, Assigned, is_nested || is_non_trivial>::Result LHS_Result;
-          typedef typename register_temporaries<RHS, only_first_order, Assigned, is_nested || is_non_trivial>::Result RHS_Result;
-
-          typedef compound_node<LHS_Result,OP,RHS_Result, is_non_trivial&& ( is_temporary || is_nested )  > RecursiveResult;
-          typedef compound_node<LHS,OP,RHS,true> EarlyStoppingResult;
-        public:
-          typedef typename get_type_if<EarlyStoppingResult, RecursiveResult,is_non_trivial && only_first_order && is_nested>::Result Result;
-      };
-
-
-      ////////////////////////////////
-      //////// EXTRACTIF ////////
-      ///////////////////////////////
-
-
-      template <class T, 
-                template<class> class Pred,
-                template<class, class> class Comp = typelist_utils::true_comp,
-                class TList = NullType>
-      struct extract_if 
-      {
-        private:
-          typedef typelist<T,TList>    TypeTrue;
-          typedef NullType             TypeFalse;
-        public:
-          typedef typename get_type_if<TypeTrue, TypeFalse, Pred<T>::value>::Result      Result;
-      };
-
-      template <class T,
-                std::string (*U)(),
-                template<class> class Pred,
-                template<class,class> class Comp,
-                class TList>
-      struct extract_if<elementwise_modifier_impl<T,U>, Pred, Comp, TList> 
-      {
-        private:
-          typedef typename extract_if<T, Pred, Comp, TList>::Result         SUB_Result;
-        public:
-          typedef typename typelist_utils::fuse<TList,SUB_Result>::Result   Result;
-      };
-
-      template <class T,
-                template<class> class Pred,
-                template<class,class> class Comp,
-                class TList>
-      struct extract_if<inner_prod_impl_t<T>, Pred, Comp, TList > 
-      {
-        private:
-          typedef typename T::LHS LHS;
-          typedef typename T::RHS RHS;
-          typedef typename extract_if<LHS, Pred, Comp, TList>::Result            LHS_Result;
-          typedef typename extract_if<RHS,  Pred, Comp, TList>::Result           RHS_Result;
-          typedef typename typelist_utils::fuse<TList, LHS_Result>::Result       TmpResult1;
-          typedef typename typelist_utils::fuse<TmpResult1, RHS_Result>::Result  TmpResult2;
-          
-          typedef TmpResult2                                                                        TypeFalse;
-          typedef typename typelist_utils::append<TmpResult2, inner_prod_impl_t<T> >::Result        TypeTrue;
-          
-        public:
-          typedef typename get_type_if<TypeTrue, TypeFalse, Pred< inner_prod_impl_t<T> >::value>::Result   Result;
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary,
-                template<class> class Pred,
-                template<class,class> class Comp,
-                class TList>
-      struct extract_if< compound_node<LHS, OP, RHS, is_temporary>, Pred, Comp, TList>
-      {
-        private:
-          typedef compound_node<LHS,OP,RHS,is_temporary> T;
-          typedef typename extract_if<LHS,Pred,Comp,TList>::Result LHS_Result;
-          typedef typename extract_if<RHS,Pred,Comp,TList>::Result RHS_Result;
-
-          typedef typename typelist_utils::fuse< typename typelist_utils::fuse<TList, LHS_Result, Comp>::Result,
-                                                 RHS_Result, 
-                                                 Comp >::Result     TypeFalse;
-          typedef typelist<T, TList>                                TypeTrue;
-        public:
-          typedef typename get_type_if<TypeTrue, TypeFalse, Pred<T>::value>::Result       Result;
-      };
-
-
-      ///////////////////////////////
-      //////// FLIP_TREE  ///////////
-      ///////////////////////////////
-
-      template <class OP, bool flip>
-      struct invert_flip 
-      {
-        enum { value = flip };
-      };
-
-      template <bool flip>
-      struct invert_flip<sub_type, flip> 
-      {
-        enum { value = !flip };
-      };
-
-      template <class OP, bool flip>
-      struct flip_operator 
-      {
-          typedef OP Result;
-      };
-
-      template <>
-      struct flip_operator<sub_type, true> 
-      {
-          typedef add_type Result;
-      };
-
-      template <>
-      struct flip_operator<add_type, true> 
-      {
-          typedef sub_type Result;
-      };
-
-      template <class T, bool flip = false>
-      struct flip_tree 
-      {
-          typedef T Result;
-      };
-
-      template <class T, std::string (*U)(),  bool flip>
-      struct flip_tree <elementwise_modifier_impl<T,U>, flip> 
-      {
-        private:
-          typedef typename flip_tree<T, flip>::Result       SUB_Result;
-        public:
-          typedef elementwise_modifier_impl<SUB_Result,U>   Result;
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary, bool flip>
-      struct flip_tree< compound_node<LHS, OP, RHS, is_temporary>, flip>
-      {
-        private:
-          typedef typename flip_tree<LHS,flip>::Result LHS_Result;
-          typedef typename flip_tree<RHS, invert_flip<OP, flip>::value >::Result RHS_Result;
-
-        public:
-          typedef compound_node<LHS_Result, typename flip_operator<OP, flip>::Result , RHS_Result, is_temporary> Result;
-      };
-
-      ////////////////////////////////
-      //////// REMOVE_IF ////////////
-      ///////////////////////////////
-
-      template <class OP, class RHS>
-      struct handle_unary_minus
-      {
-        typedef RHS Result;
-      };
-
-      template <class RHS>
-      struct handle_unary_minus<sub_type, RHS> 
-      {
-        typedef compound_node<NullType,sub_type,RHS> Result;
-      };
-
-      template <class T>
-      struct compound_to_simple 
-      {
-        typedef T Result;
-      };
-
-      template <class LHS, class OP>
-      struct compound_to_simple<compound_node<LHS, OP, NullType> > 
-      {
-        typedef LHS Result;
-      };
-
-      template <class OP, class RHS>
-      struct compound_to_simple<compound_node<NullType, OP, RHS> > 
-      {
-        typedef typename handle_unary_minus<OP,RHS>::Result Result;
-      };
-
-      template <class OP, class RHS, class Enable=void>
-      struct get_new_operator 
-      {
-        typedef OP Result;
-      };
-
-      template <class RHS_OP, class RHS_RHS>
-      struct get_new_operator <sub_type, compound_node<NullType, RHS_OP, RHS_RHS> >
-      {
-        typedef RHS_OP Result;
-      };
-
-      template <class T, template<class> class Pred>
-      struct remove_if 
-      {
-        typedef typename get_type_if<NullType,T,Pred<T>::value>::Result    Result;
-        typedef typename get_type_if<NullType,T,Pred<T>::value>::Result    TmpTree;
-      };
-
-      template <class T, std::string (*U)(), template<class> class Pred>
-      struct remove_if<elementwise_modifier_impl<T,U>,Pred > 
-      {
-        typedef elementwise_modifier_impl<typename remove_if<T,Pred>::Result, U> Result;
-      };
-
-      template <class LHS, class OP, class RHS, bool is_temporary, template<class> class Pred>
-      struct remove_if<compound_node<LHS,OP,RHS,is_temporary>, Pred> 
-      {
-        private:
-          typedef compound_node<LHS,OP,RHS,is_temporary> T;
-
-          typedef typename remove_if<LHS,Pred>::TmpTree LHS_TmpTree;
-          typedef typename remove_if<RHS,Pred>::TmpTree RHS_TmpTree;
-
-          typedef typename compound_to_simple<typename remove_if<LHS,Pred>::Result>::Result LHS_Result;
-          typedef typename compound_to_simple<typename remove_if<RHS,Pred>::Result>::Result RHS_Result;
-
-          typedef compound_node<LHS_TmpTree,OP,RHS_TmpTree> TmpTree0;
-          typedef typename compound_to_simple<compound_node<LHS_Result,
-                                                            typename get_new_operator<OP,RHS_TmpTree>::Result,
-                                                            RHS_Result,
-                                                            is_temporary> >::Result    Result0;
-        public:
-          typedef typename get_type_if<NullType, TmpTree0,  Pred<T>::value>::Result    TmpTree;
-          typedef typename get_type_if<NullType, Result0,   Pred<T>::value>::Result    Result;
-      };
-      
-    }  // namespace tree_utils
-  } // namespace generator
-} // namespace viennacl
-#endif
diff --git a/viennacl/linalg/coordinate_matrix_operations.hpp b/viennacl/linalg/coordinate_matrix_operations.hpp
deleted file mode 100644
index 8dd03e8..0000000
--- a/viennacl/linalg/coordinate_matrix_operations.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-#ifndef VIENNACL_COORDINATE_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_COORDINATE_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file coordinate_matrix_operations.hpp
-    @brief Implementations of operations using coordinate_matrix
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
-
-namespace viennacl
-{
-  namespace linalg
-  {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a coordinate_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
-    //namespace {
-    /** @brief Carries out matrix-vector multiplication with a coordinate_matrix
-    *
-    * Implementation of the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param result The result vector
-    */
-      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::coordinate_matrix<TYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-        result.clear();
-        
-        //std::cout << "prod(coordinate_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::coordinate_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
-        unsigned int thread_num = 256; //k.local_work_size(0);
-        
-        k.local_work_size(0, thread_num);
-        
-        k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
-        //k.global_work_size(0, thread_num);  //Only one work group
-        viennacl::ocl::enqueue(k(mat.handle12(), mat, mat.handle3(),
-                                 vec,
-                                 result,
-                                 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
-                                 viennacl::ocl::local_mem(sizeof(TYPE)*thread_num)) );
-
-      }
-    //};
-
-  } //namespace linalg
-
-
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
-    }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
-
-} //namespace viennacl
-
-
-#endif
diff --git a/viennacl/linalg/lanczos.hpp~ b/viennacl/linalg/lanczos.hpp~
deleted file mode 100644
index a4f83df..0000000
--- a/viennacl/linalg/lanczos.hpp~
+++ /dev/null
@@ -1,490 +0,0 @@
-#ifndef VIENNACL_LINALG_LANCZOS_HPP_
-#define VIENNACL_LINALG_LANCZOS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/lanczos.hpp
-*   @brief Generic interface for the Lanczos algorithm.
-* 
-*   Contributed by Guenther Mader and Astrid Rupp.
-*/
-
-#include <math.h>    //for sqrt()
-#include <vector>
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"
-#include "viennacl/linalg/bisect.hpp"
-#include <boost/random.hpp>
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/matrix_expression.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/numeric/ublas/operation.hpp> 
-#include <boost/numeric/ublas/vector_expression.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-namespace viennacl
-{
-  namespace linalg 
-  {
-    
-    /** @brief A tag for the lanczos algorithm. 
-    */
-    class lanczos_tag 
-    {
-      public:
-        
-        enum
-        {
-          partial_reorthogonalization = 0,
-          full_reorthogonalization,
-          no_reorthogonalization
-        };
-
-        /** @brief The constructor
-        *
-        * @param factor                 Exponent of epsilon - tolerance for batches of Reorthogonalization
-        * @param num_eigenvalues        Number of eigenvalues to be returned
-        * @param met                    Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization
-        * @param krylov_size            Maximal krylov-space size
-        */
-
-        lanczos_tag(double factor = 0.75,
-                    std::size_t numeig = 10,
-                    int met = 0,
-                    std::size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {};
-
-        /** @brief Sets the number of eigenvalues */
-        void num_eigenvalues(int numeig){ num_eigenvalues_ = numeig; }
-
-          /** @brief Returns the number of eigenvalues */
-        std::size_t num_eigenvalues() const { return num_eigenvalues_; }
-
-          /** @brief Sets the exponent of epsilon */
-        void factor(double fct) { factor_ = fct; }
-
-        /** @brief Returns the exponent */
-        double factor() const { return factor_; }
-        
-        /** @brief Sets the size of the kylov space */
-        void krylov_size(int max) { krylov_size_ = max; }
-
-        /** @brief Returns the size of the kylov space */  
-        std::size_t  krylov_size() const { return krylov_size_; }
-
-        /** @brief Sets the reorthogonalization method */ 
-        void method(int met){ method_ = met; }
-        
-        /** @brief Returns the reorthogonalization method */ 
-        int method() const { return method_; }
-
-
-      private: 
-        double factor_;
-        std::size_t num_eigenvalues_;
-        int method_; // see enum defined above for possible values
-        std::size_t krylov_size_;
-
-    };
-    
-
-    /** 
-    *   @brief Implementation of the calculation of eigenvalues using lanczos
-    *   
-    *   @param matrix        The system matrix
-    *   @param tag           Tag with several options for the lanczos algorithm
-    *   @return              Returns the n largest eigenvalues (n defined in the lanczos_tag)
-    */
-    template< typename MatrixT >
-    std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
-    eig(MatrixT const & matrix, lanczos_tag const & tag)
-    {
-      typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
-      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-      typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
-    
-      boost::mt11213b mt;
-      boost::normal_distribution<double> N(0, 1);
-      boost::bernoulli_distribution<double> B(0.5);
-      boost::triangle_distribution<double> T(-1, 0, 1);
-
-      boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
-      boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
-      boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
-      
-      std::vector<CPU_ScalarType> eigenvalues;
-      std::size_t matrix_size = matrix.size1();
-      VectorT r(matrix_size);
-      std::vector<CPU_ScalarType> s(matrix_size);
-      
-      for(std::size_t i=0; i<s.size(); ++i)
-        s[i] = 3.0 * get_B() + get_T() - 1.5; 
-
-      detail::copy_vec_to_vec(s,r);
-
-      std::size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
-                                                                  : tag.krylov_size();
-      
-      switch(tag.method())
-      {
-        case lanczos_tag::partial_reorthogonalization:
-          eigenvalues = detail::lanczosPRO(matrix, r, size_krylov, tag);
-          break;
-        case lanczos_tag::full_reorthogonalization:
-          eigenvalues = detail::lanczosFRO(matrix, r, size_krylov, tag);
-          break;
-        case lanczos_tag::no_reorthogonalization:
-          eigenvalues = detail::lanczos(matrix, r, size_krylov, tag);
-          break;                
-      }
-
-      std::vector<CPU_ScalarType> largest_eigenvalues;
-
-      for(std::size_t i = 1; i<=tag.num_eigenvalues(); i++)
-        largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
-    
-    
-      return largest_eigenvalues;
-    }
-    
-    
-    namespace detail
-    {
-      /** 
-      *   @brief Implementation of the Lanczos PRO algorithm
-      *   
-      *   @param A            The system matrix
-      *   @param r            Random start vector 
-      *   @param size         Size of krylov-space
-      *   @param tag          Lanczos_tag with several options for the algorithm
-      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
-      */
-
-      template< typename MatrixT, typename VectorT >
-      std::vector<
-              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
-              >
-      lanczosPRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-    
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-
-        
-        // generation of some random numbers, used for lanczos PRO algorithm
-        boost::mt11213b mt;
-        boost::normal_distribution<double> N(0, 1);
-        boost::bernoulli_distribution<double> B(0.5);
-        boost::triangle_distribution<double> T(-1, 0, 1);
-
-        boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
-        boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
-        boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
-
-        
-        long i, j, k, index, retry, reorths;
-        std::vector<long> l_bound(size/2), u_bound(size/2);
-        bool second_step;
-        double squ_eps, eta, temp, eps, retry_th;
-        long n = r.size();
-        std::vector< std::vector<CPU_ScalarType> > w(2, std::vector<CPU_ScalarType>(size));
-        CPU_ScalarType cpu_beta;
-
-        boost::numeric::ublas::vector<CPU_ScalarType> s(n);
-
-        VectorT t(n);
-        CPU_ScalarType inner_rt;
-        ScalarType vcl_beta;
-        ScalarType vcl_alpha;
-        std::vector<CPU_ScalarType> alphas, betas;
-        boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
-
-        second_step = false;
-        eps = std::numeric_limits<double>::epsilon();
-        squ_eps = sqrt(eps);
-        retry_th = 1e-2;
-        eta =  exp(log(eps) * tag.factor());
-        reorths = 0;
-        retry = 0;
-        
-        vcl_beta = viennacl::linalg::norm_2(r);
-        
-        r /= vcl_beta;
-        
-        detail::copy_vec_to_vec(r,s);
-        boost::numeric::ublas::column(Q, 0) = s;
-        
-        VectorT u = viennacl::linalg::prod(A, r);
-        vcl_alpha = viennacl::linalg::inner_prod(u, r);
-        alphas.push_back(vcl_alpha);
-        w[0][0] = 1;
-        betas.push_back(vcl_beta);
-        
-        long batches = 0;
-        for(i = 1;i < size; i++)
-        {
-          r = u - vcl_alpha * r;
-          vcl_beta = viennacl::linalg::norm_2(r);
-
-          betas.push_back(vcl_beta);
-          r = r / vcl_beta;
-
-          index = i % 2;
-          w[index][i] = 1;
-          k = (i + 1) % 2;
-          w[index][0] = (betas[1] * w[k][1] + (alphas[0] - vcl_alpha) * w[k][0] - betas[i - 1] * w[index][0]) / vcl_beta + eps * 0.3 * get_N() * (betas[1] + vcl_beta);
-          
-          for(j = 1;j < i - 1;j++)
-          {
-                  w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);	  
-          }
-          w[index][i - 1] = 0.6 * eps * n * get_N() * betas[1] / vcl_beta;
-
-          if(second_step)
-          {
-            for(j = 0;j < batches;j++)
-            {
-              l_bound[j]++;
-              u_bound[j]--;
-
-              for(k = l_bound[j];k < u_bound[j];k++)
-              {
-                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
-                inner_rt = viennacl::linalg::inner_prod(r,t);
-                r = r - inner_rt * t;	
-                w[index][k] = 1.5 * eps * get_N();
-                reorths++;
-              }
-            }
-            temp = viennacl::linalg::norm_2(r);
-            r = r / temp;
-            vcl_beta = vcl_beta * temp;
-            second_step = false;
-          }
-          batches = 0;
-
-          for(j = 0;j < i;j++)
-          { 
-            if(fabs(w[index][j]) >= squ_eps)
-            {
-              detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, j), t);
-              inner_rt = viennacl::linalg::inner_prod(r,t);
-              r = r - inner_rt * t;
-              w[index][j] = 1.5 * eps * get_N();
-              k = j - 1;
-              reorths++;
-              while(k >= 0 && fabs(w[index][k]) > eta)
-              {
-                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
-                inner_rt = viennacl::linalg::inner_prod(r,t);
-                r = r - inner_rt * t;
-                w[index][k] = 1.5 * eps * get_N();
-                k--;
-                reorths++;
-              }
-              l_bound[batches] = k + 1;
-              k = j + 1;
-              
-              while(k < i && fabs(w[index][k]) > eta)
-              {
-                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
-                inner_rt = viennacl::linalg::inner_prod(r,t);
-                r = r - inner_rt * t;	
-                w[index][k] = 1.5 * eps * get_N();
-                k++;
-                reorths++;
-              }
-              u_bound[batches] = k - 1;
-              batches++;
-              j = k;
-            }
-          }
-          
-          if(batches > 0)
-          {
-            temp = viennacl::linalg::norm_2(r);
-            r = r / temp;
-            vcl_beta = vcl_beta * temp;
-            second_step = true;
-
-            while(temp < retry_th)
-            {
-              for(j = 0;j < i;j++)
-              {
-                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
-                inner_rt = viennacl::linalg::inner_prod(r,t);
-                r = r - inner_rt * t;
-                reorths++;
-              }
-              retry++;
-              temp = viennacl::linalg::norm_2(r);
-              r = r / temp;
-              vcl_beta = vcl_beta * temp;
-            }
-          }
-      
-          detail::copy_vec_to_vec(r,s);
-          boost::numeric::ublas::column(Q, i) = s;
-
-          cpu_beta = vcl_beta;
-          s = - cpu_beta * boost::numeric::ublas::column(Q, i - 1);
-          detail::copy_vec_to_vec(s, u);
-          u += viennacl::linalg::prod(A, r);
-          vcl_alpha = viennacl::linalg::inner_prod(u, r);
-          alphas.push_back(vcl_alpha);
-        }
-
-        return bisect(alphas, betas);
-      
-      }
-
-
-      /** 
-      *   @brief Implementation of the lanczos algorithm without reorthogonalization
-      * 
-      *   @param A            The system matrix
-      *   @param r            Random start vector 
-      *   @param size         Size of krylov-space
-      *   @param tag          Lanczos_tag with several options for the algorithm
-      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
-      */
-      template< typename MatrixT, typename VectorT >
-      std::vector<
-              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
-              >
-      lanczos (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-      
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-
-        long i;
-        ScalarType vcl_beta;
-        ScalarType vcl_alpha;
-        std::vector<CPU_ScalarType> alphas, betas;
-        CPU_ScalarType norm;
-        long n = r.size();
-        VectorT u(n), t(n);
-        boost::numeric::ublas::vector<CPU_ScalarType> s(r.size()), u_zero(n), q(n);
-        boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
-
-        u_zero = boost::numeric::ublas::zero_vector<CPU_ScalarType>(n);
-        detail::copy_vec_to_vec(u_zero, u);
-        norm = norm_2(r);
-        
-        for(i = 0;i < size; i++)
-        {
-          r /= norm;
-          vcl_beta = norm;
-
-          detail::copy_vec_to_vec(r,s);
-          boost::numeric::ublas::column(Q, i) = s;
-
-          u += prod(A, r);
-          vcl_alpha = inner_prod(u, r);
-          r = u - vcl_alpha * r;
-          norm = norm_2(r);
-
-          q = boost::numeric::ublas::column(Q, i);
-          detail::copy_vec_to_vec(q, t);
-
-          u = - norm * t;
-          alphas.push_back(vcl_alpha);
-          betas.push_back(vcl_beta);
-          s.clear();
-        }
-
-        return bisect(alphas, betas);
-      }
-
-      /** 
-      *   @brief Implementation of the Lanczos FRO algorithm
-      *   
-      *   @param A            The system matrix 
-      *   @param r            Random start vector 
-      *   @param size         Size of krylov-space
-      *   @param tag          Lanczos_tag with several options for the algorithm
-      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
-      */
-      template< typename MatrixT, typename VectorT >
-      std::vector<
-              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
-              >
-      lanczosFRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-        
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-        
-          CPU_ScalarType temp;
-          CPU_ScalarType norm;
-          ScalarType vcl_beta;
-          ScalarType vcl_alpha;
-          std::vector<CPU_ScalarType> alphas, betas;
-          long n = r.size();
-          VectorT u(n), t(n);
-          ScalarType inner_rt;
-          boost::numeric::ublas::vector<CPU_ScalarType> u_zero(n), s(r.size()), q(n);
-          boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
-          
-          long reorths = 0;
-          norm = norm_2(r);
-
-
-          for(long i = 0; i < size; i++)
-          {
-            r /= norm;
-
-            for(long j = 0; j < i; j++)
-            {
-              q = boost::numeric::ublas::column(Q, j);
-              detail::copy_vec_to_vec(q, t);
-              inner_rt = viennacl::linalg::inner_prod(r,t);
-              r = r - inner_rt * t;
-              reorths++;
-            }
-            temp = viennacl::linalg::norm_2(r);
-            r = r / temp;
-            vcl_beta = temp * norm;
-            detail::copy_vec_to_vec(r,s);
-            boost::numeric::ublas::column(Q, i) = s;
-
-            u += viennacl::linalg::prod(A, r);
-            vcl_alpha = viennacl::linalg::inner_prod(u, r);
-            r = u - vcl_alpha * r;
-            norm = viennacl::linalg::norm_2(r);
-            q = boost::numeric::ublas::column(Q, i);
-            detail::copy_vec_to_vec(q, t);
-            u = - norm * t;
-            alphas.push_back(vcl_alpha);
-            betas.push_back(vcl_beta);
-          }
-          
-          return bisect(alphas, betas);
-      }
-
-    } // end namespace detail
-    
-  } // end namespace linalg
-} // end namespace viennacl
-#endif
\ No newline at end of file
diff --git a/viennacl/tools/matrix_kernel_class_deducer.hpp b/viennacl/tools/matrix_kernel_class_deducer.hpp
deleted file mode 100644
index b942c51..0000000
--- a/viennacl/tools/matrix_kernel_class_deducer.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef VIENNACL_TOOLS_MATRIX_KERNEL_CLASS_DEDUCER_HPP_
-#define VIENNACL_TOOLS_MATRIX_KERNEL_CLASS_DEDUCER_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix_kernel_class_deducer.hpp
-    @brief Implementation of a helper meta class for deducing the correct kernels for the supplied matrix
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/kernels/matrix_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_row_kernels.h"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    /**     @brief Implementation of a helper meta class for deducing the correct kernels for the supplied matrix */
-    template <typename MatrixType1>
-    struct MATRIX_KERNEL_CLASS_DEDUCER
-    {};
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    //support for matrix range:
-    template <typename T>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix_range<T> >
-    {
-      typedef typename MATRIX_KERNEL_CLASS_DEDUCER<T>::ResultType    ResultType;
-    };
-    
-    //support for matrix slice:
-    template <typename T>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix_slice<T> >
-    {
-      typedef typename MATRIX_KERNEL_CLASS_DEDUCER<T>::ResultType    ResultType;
-    };
-    
-  }
-
-}
-
-#endif
diff --git a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp b/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
deleted file mode 100644
index 3c3b6f9..0000000
--- a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-#ifndef VIENNACL_TOOLS_MATRIX_PROD_KERNEL_CLASS_DEDUCER_HPP_
-#define VIENNACL_TOOLS_MATRIX_PROD_KERNEL_CLASS_DEDUCER_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix_prod_kernel_class_deducer.hpp
-    @brief Implementation of a helper meta class for deducing the correct kernels for matrix-matrix products
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    namespace detail
-    {
-      template <typename MatrixType>
-      struct extract_matrix
-      {
-        typedef typename MatrixType::ERROR_UNKNOWN_MATRIX_TYPE_PROVIDED   error_type;
-      };
-      
-      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-      struct extract_matrix < viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-      {
-        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
-      };
-
-      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-      struct extract_matrix < const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-      {
-        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
-      };
-
-      
-      template <typename MatrixType>
-      struct extract_matrix < viennacl::matrix_range<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-
-      template <typename MatrixType>
-      struct extract_matrix < const viennacl::matrix_range<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-      
-      template <typename MatrixType>
-      struct extract_matrix < viennacl::matrix_slice<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-
-      template <typename MatrixType>
-      struct extract_matrix < const viennacl::matrix_slice<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-      
-    }
-    
-    
-    
-    /** @brief deduces kernel type for C=A*B, where A, B, C are MatrixType1, MatrixType2 and MatrixType3 respectively */
-    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER
-    {
-      typedef typename MATRIX_PROD_KERNEL_CLASS_DEDUCER< typename detail::extract_matrix<MatrixType1>::type,
-                                                         typename detail::extract_matrix<MatrixType2>::type,
-                                                         typename detail::extract_matrix<MatrixType3>::type>::ResultType   ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_row_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_row_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_col_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_col_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_row_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_row_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_col_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_col_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-  }
-
-}
-
-#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/viennacl.git